Index: llvm/include/llvm/CodeGen/Passes.h =================================================================== --- llvm/include/llvm/CodeGen/Passes.h +++ llvm/include/llvm/CodeGen/Passes.h @@ -15,6 +15,8 @@ #define LLVM_CODEGEN_PASSES_H #include "llvm/Support/CodeGen.h" +#include "llvm/CodeGen/RegAllocCommon.h" + #include #include @@ -169,16 +171,20 @@ /// possible. It is best suited for debug code where live ranges are short. /// FunctionPass *createFastRegisterAllocator(); + FunctionPass *createFastRegisterAllocator(RegClassFilterFunc F, + bool ClearVirtRegs); /// BasicRegisterAllocation Pass - This pass implements a degenerate global /// register allocator using the basic regalloc framework. /// FunctionPass *createBasicRegisterAllocator(); + FunctionPass *createBasicRegisterAllocator(RegClassFilterFunc F); /// Greedy register allocation pass - This pass implements a global register /// allocator for optimized builds. /// FunctionPass *createGreedyRegisterAllocator(); + FunctionPass *createGreedyRegisterAllocator(RegClassFilterFunc F); /// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean /// Quadratic Prograaming (PBQP) based register allocator. Index: llvm/include/llvm/CodeGen/RegAllocCommon.h =================================================================== --- /dev/null +++ llvm/include/llvm/CodeGen/RegAllocCommon.h @@ -0,0 +1,32 @@ +//===- RegAllocCommon.h - Utilities shared between allocators ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGALLOCCOMMON_H +#define LLVM_CODEGEN_REGALLOCCOMMON_H + +#include + +namespace llvm { + +class TargetRegisterClass; +class TargetRegisterInfo; + +typedef std::function RegClassFilterFunc; + +/// Default register class filter function for register allocation. All virtual +/// registers should be allocated. +static inline bool allocateAllRegClasses(const TargetRegisterInfo &, + const TargetRegisterClass &) { + return true; +} + +} + +#endif // LLVM_CODEGEN_REGALLOCCOMMON_H Index: llvm/include/llvm/CodeGen/RegAllocRegistry.h =================================================================== --- llvm/include/llvm/CodeGen/RegAllocRegistry.h +++ llvm/include/llvm/CodeGen/RegAllocRegistry.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_REGALLOCREGISTRY_H #define LLVM_CODEGEN_REGALLOCREGISTRY_H +#include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/CodeGen/MachinePassRegistry.h" namespace llvm { Index: llvm/lib/CodeGen/LiveIntervals.cpp =================================================================== --- llvm/lib/CodeGen/LiveIntervals.cpp +++ llvm/lib/CodeGen/LiveIntervals.cpp @@ -714,10 +714,15 @@ if (LI.empty()) continue; + // Target may have not allocated this yet. + Register PhysReg = VRM->getPhys(Reg); + if (!PhysReg) + continue; + // Find the regunit intervals for the assigned register. They may overlap // the virtual register live range, cancelling any kills. 
RU.clear(); - for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid(); + for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) { const LiveRange &RURange = getRegUnit(*Unit); if (RURange.empty()) Index: llvm/lib/CodeGen/RegAllocBase.h =================================================================== --- llvm/lib/CodeGen/RegAllocBase.h +++ llvm/lib/CodeGen/RegAllocBase.h @@ -37,6 +37,7 @@ #define LLVM_LIB_CODEGEN_REGALLOCBASE_H #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/CodeGen/RegisterClassInfo.h" namespace llvm { @@ -67,6 +68,7 @@ LiveIntervals *LIS = nullptr; LiveRegMatrix *Matrix = nullptr; RegisterClassInfo RegClassInfo; + const RegClassFilterFunc ShouldAllocateClass; /// Inst which is a def of an original reg and whose defs are already all /// dead after remat is saved in DeadRemats. The deletion of such inst is @@ -74,7 +76,9 @@ /// always available for the remat of all the siblings of the original reg. SmallPtrSet DeadRemats; - RegAllocBase() = default; + RegAllocBase(const RegClassFilterFunc F = allocateAllRegClasses) : + ShouldAllocateClass(F) {} + virtual ~RegAllocBase() = default; // A RegAlloc pass should call this before allocatePhysRegs. @@ -92,7 +96,10 @@ virtual Spiller &spiller() = 0; /// enqueue - Add VirtReg to the priority queue of unassigned registers. - virtual void enqueue(LiveInterval *LI) = 0; + virtual void enqueueImpl(LiveInterval *LI) = 0; + + /// enqueue - Add VirtReg to the priority queue of unassigned registers. + void enqueue(LiveInterval *LI); /// dequeue - Return the next unassigned register, or NULL. 
virtual LiveInterval *dequeue() = 0; Index: llvm/lib/CodeGen/RegAllocBase.cpp =================================================================== --- llvm/lib/CodeGen/RegAllocBase.cpp +++ llvm/lib/CodeGen/RegAllocBase.cpp @@ -172,3 +172,21 @@ } DeadRemats.clear(); } + +void RegAllocBase::enqueue(LiveInterval *LI) { + const Register Reg = LI->reg(); + + assert(Reg.isVirtual() && "Can only enqueue virtual registers"); + + if (VRM->hasPhys(Reg)) + return; + + const TargetRegisterClass &RC = *MRI->getRegClass(Reg); + if (ShouldAllocateClass(*TRI, RC)) { + LLVM_DEBUG(dbgs() << "Enqueuing " << printReg(Reg, TRI) << '\n'); + enqueueImpl(LI); + } else { + LLVM_DEBUG(dbgs() << "Not enqueueing " << printReg(Reg, TRI) + << " in skipped register class\n"); + } +} Index: llvm/lib/CodeGen/RegAllocBasic.cpp =================================================================== --- llvm/lib/CodeGen/RegAllocBasic.cpp +++ llvm/lib/CodeGen/RegAllocBasic.cpp @@ -76,7 +76,7 @@ void LRE_WillShrinkVirtReg(Register) override; public: - RABasic(); + RABasic(const RegClassFilterFunc F = allocateAllRegClasses); /// Return the pass name. 
StringRef getPassName() const override { return "Basic Register Allocator"; } @@ -88,7 +88,7 @@ Spiller &spiller() override { return *SpillerInstance; } - void enqueue(LiveInterval *LI) override { + void enqueueImpl(LiveInterval *LI) override { Queue.push(LI); } @@ -171,7 +171,9 @@ enqueue(&LI); } -RABasic::RABasic(): MachineFunctionPass(ID) { +RABasic::RABasic(RegClassFilterFunc F): + MachineFunctionPass(ID), + RegAllocBase(F) { } void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { @@ -334,7 +336,10 @@ return true; } -FunctionPass* llvm::createBasicRegisterAllocator() -{ +FunctionPass* llvm::createBasicRegisterAllocator() { return new RABasic(); } + +FunctionPass* llvm::createBasicRegisterAllocator(RegClassFilterFunc F) { + return new RABasic(F); +} Index: llvm/lib/CodeGen/RegAllocFast.cpp =================================================================== --- llvm/lib/CodeGen/RegAllocFast.cpp +++ llvm/lib/CodeGen/RegAllocFast.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -69,7 +70,13 @@ public: static char ID; - RegAllocFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {} + RegAllocFast(const RegClassFilterFunc F = allocateAllRegClasses, + bool ClearVirtRegs_ = true) : + MachineFunctionPass(ID), + ShouldAllocateClass(F), + StackSlotForVirtReg(-1), + ClearVirtRegs(ClearVirtRegs_) { + } private: MachineFrameInfo *MFI; @@ -77,6 +84,7 @@ const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; RegisterClassInfo RegClassInfo; + const RegClassFilterFunc ShouldAllocateClass; /// Basic block currently being allocated. MachineBasicBlock *MBB; @@ -84,6 +92,8 @@ /// Maps virtual regs to the frame index where these values are spilled. 
IndexedMap StackSlotForVirtReg; + bool ClearVirtRegs; + /// Everything we know about a live virtual register. struct LiveReg { MachineInstr *LastUse = nullptr; ///< Last instr to use reg. @@ -199,8 +209,12 @@ } MachineFunctionProperties getSetProperties() const override { - return MachineFunctionProperties().set( + if (ClearVirtRegs) { + return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); + } + + return MachineFunctionProperties(); } MachineFunctionProperties getClearedProperties() const override { @@ -1473,9 +1487,11 @@ for (MachineBasicBlock &MBB : MF) allocateBasicBlock(MBB); - // All machine operands and other references to virtual registers have been - // replaced. Remove the virtual registers. - MRI->clearVirtRegs(); + if (ClearVirtRegs) { + // All machine operands and other references to virtual registers have been + // replaced. Remove the virtual registers. + MRI->clearVirtRegs(); + } StackSlotForVirtReg.clear(); LiveDbgValueMap.clear(); @@ -1485,3 +1501,9 @@ FunctionPass *llvm::createFastRegisterAllocator() { return new RegAllocFast(); } + +FunctionPass *llvm::createFastRegisterAllocator( + std::function Ftor, bool ClearVirtRegs) { + return new RegAllocFast(Ftor, ClearVirtRegs); +} Index: llvm/lib/CodeGen/RegAllocGreedy.cpp =================================================================== --- llvm/lib/CodeGen/RegAllocGreedy.cpp +++ llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -407,7 +407,7 @@ SmallSetVector SetOfBrokenHints; public: - RAGreedy(); + RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses); /// Return the pass name. 
StringRef getPassName() const override { return "Greedy Register Allocator"; } @@ -416,7 +416,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override; void releaseMemory() override; Spiller &spiller() override { return *SpillerInstance; } - void enqueue(LiveInterval *LI) override; + void enqueueImpl(LiveInterval *LI) override; LiveInterval *dequeue() override; MCRegister selectOrSplit(LiveInterval &, SmallVectorImpl &) override; @@ -599,7 +599,22 @@ return new RAGreedy(); } -RAGreedy::RAGreedy(): MachineFunctionPass(ID) { +namespace llvm { +FunctionPass* createGreedyRegisterAllocator( + std::function Ftor); + +} + +FunctionPass* llvm::createGreedyRegisterAllocator( + std::function Ftor) { + return new RAGreedy(Ftor); +} + +RAGreedy::RAGreedy(RegClassFilterFunc F): + MachineFunctionPass(ID), + RegAllocBase(F) { } void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { @@ -656,7 +671,7 @@ // Register is assigned, put it back on the queue for reassignment. LiveInterval &LI = LIS->getInterval(VirtReg); Matrix->unassign(LI); - enqueue(&LI); + RegAllocBase::enqueue(&LI); } void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) { @@ -679,7 +694,7 @@ GlobalCand.clear(); } -void RAGreedy::enqueue(LiveInterval *LI) { enqueue(Queue, LI); } +void RAGreedy::enqueueImpl(LiveInterval *LI) { enqueue(Queue, LI); } void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // Prioritize live ranges by size, assigning larger ranges first. @@ -2924,7 +2939,12 @@ if (Register::isPhysicalRegister(Reg)) continue; - assert(VRM->hasPhys(Reg) && "We have unallocated variable!!"); + // This may be a skipped class + if (!VRM->hasPhys(Reg)) { + assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) && + "We have an unallocated variable which should have been handled"); + continue; + } // Get the live interval mapped with this virtual register to be able // to check for the interference with the new color. 
Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1168,8 +1168,8 @@ } bool TargetPassConfig::addRegAssignAndRewriteFast() { - if (RegAlloc != &useDefaultRegisterAllocator && - RegAlloc != &createFastRegisterAllocator) + if (RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator && + RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&createFastRegisterAllocator) report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc."); addPass(createRegAllocPass(false)); Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -36,6 +36,7 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -57,6 +58,115 @@ using namespace llvm; +namespace { +class SGPRRegisterRegAlloc : public RegisterRegAllocBase { +public: + SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) + : RegisterRegAllocBase(N, D, C) {} +}; + +class VGPRRegisterRegAlloc : public RegisterRegAllocBase { +public: + VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) + : RegisterRegAllocBase(N, D, C) {} +}; + +static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return static_cast(TRI).isSGPRClass(&RC); +} + +static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return !static_cast(TRI).isSGPRClass(&RC); +} + + +/// -{sgpr|vgpr}-regalloc=... command line option. 
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } + +/// A dummy default pass factory indicates whether the register allocator is +/// overridden on the command line. +static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; +static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; + +static SGPRRegisterRegAlloc +defaultSGPRRegAlloc("default", + "pick SGPR register allocator based on -O option", + useDefaultRegisterAllocator); + +static cl::opt> +SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use for SGPRs")); + +static cl::opt> +VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use for VGPRs")); + + +static void initializeDefaultSGPRRegisterAllocatorOnce() { + RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); + + if (!Ctor) { + Ctor = SGPRRegAlloc; + SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc); + } +} + +static void initializeDefaultVGPRRegisterAllocatorOnce() { + RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); + + if (!Ctor) { + Ctor = VGPRRegAlloc; + VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); + } +} + +static FunctionPass *createBasicSGPRRegisterAllocator() { + return createBasicRegisterAllocator(onlyAllocateSGPRs); +} + +static FunctionPass *createGreedySGPRRegisterAllocator() { + return createGreedyRegisterAllocator(onlyAllocateSGPRs); +} + +static FunctionPass *createFastSGPRRegisterAllocator() { + return createFastRegisterAllocator(onlyAllocateSGPRs, false); +} + +static FunctionPass *createBasicVGPRRegisterAllocator() { + return createBasicRegisterAllocator(onlyAllocateVGPRs); +} + +static FunctionPass *createGreedyVGPRRegisterAllocator() { + return createGreedyRegisterAllocator(onlyAllocateVGPRs); +} + +static FunctionPass *createFastVGPRRegisterAllocator() { + return createFastRegisterAllocator(onlyAllocateVGPRs, 
true); +} + +static SGPRRegisterRegAlloc basicRegAllocSGPR( + "basic", "basic register allocator", createBasicSGPRRegisterAllocator); +static SGPRRegisterRegAlloc greedyRegAllocSGPR( + "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator); + +static SGPRRegisterRegAlloc fastRegAllocSGPR( + "fast", "fast register allocator", createFastSGPRRegisterAllocator); + + +static VGPRRegisterRegAlloc basicRegAllocVGPR( + "basic", "basic register allocator", createBasicVGPRRegisterAllocator); +static VGPRRegisterRegAlloc greedyRegAllocVGPR( + "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); + +static VGPRRegisterRegAlloc fastRegAllocVGPR( + "fast", "fast register allocator", createFastVGPRRegisterAllocator); +} + + static cl::opt EnableR600StructurizeCFG( "r600-ir-structurize", cl::desc("Use StructurizeCFG IR pass"), @@ -677,6 +787,14 @@ bool addGlobalInstructionSelect() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; + + FunctionPass *createSGPRAllocPass(bool Optimized); + FunctionPass *createVGPRAllocPass(bool Optimized); + FunctionPass *createRegAllocPass(bool Optimized) override; + + bool addRegAssignAndRewriteFast() override; + bool addRegAssignAndRewriteOptimized() override; + void addPreRegAlloc() override; bool addPreRewrite() override; void addPostRegAlloc() override; @@ -1032,6 +1150,80 @@ addPass(&GCNNSAReassignID); addPass(&GCNRegBankReassignID); } + + return true; +} + +FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) { + // Initialize the global default. 
+ llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag, + initializeDefaultSGPRRegisterAllocatorOnce); + + RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); + if (Ctor != useDefaultRegisterAllocator) + return Ctor(); + + if (Optimized) + return createGreedyRegisterAllocator(onlyAllocateSGPRs); + + return createFastRegisterAllocator(onlyAllocateSGPRs, false); +} + +FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { + // Initialize the global default. + llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag, + initializeDefaultVGPRRegisterAllocatorOnce); + + RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); + if (Ctor != useDefaultRegisterAllocator) + return Ctor(); + + if (Optimized) + return createGreedyVGPRRegisterAllocator(); + + return createFastVGPRRegisterAllocator(); +} + +FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { + llvm_unreachable("should not be used"); +} + +static const char RegAllocOptNotSupportedMessage[] = + "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc"; + +bool GCNPassConfig::addRegAssignAndRewriteFast() { + if (!usingDefaultRegAlloc()) + report_fatal_error(RegAllocOptNotSupportedMessage); + + addPass(createSGPRAllocPass(false)); + + // Equivalent of PEI for SGPRs. + addPass(&SILowerSGPRSpillsID); + + addPass(createVGPRAllocPass(false)); + return true; +} + +bool GCNPassConfig::addRegAssignAndRewriteOptimized() { + if (!usingDefaultRegAlloc()) + report_fatal_error(RegAllocOptNotSupportedMessage); + + addPass(createSGPRAllocPass(true)); + + // Commit allocated register changes. This is mostly necessary because too + // many things rely on the use lists of the physical registers, such as the + // verifier. This is only necessary with allocators which use LiveIntervals, + // since FastRegAlloc does the replacments itself. + addPass(createVirtRegRewriter(false)); + + // Equivalent of PEI for SGPRs. 
+ addPass(&SILowerSGPRSpillsID); + + addPass(createVGPRAllocPass(true)); + + addPreRewrite(); + addPass(&VirtRegRewriterID); + return true; } @@ -1040,9 +1232,6 @@ if (getOptLevel() > CodeGenOpt::None) addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); - - // Equivalent of PEI for SGPRs. - addPass(&SILowerSGPRSpillsID); } void GCNPassConfig::addPreSched2() { Index: llvm/lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -24,6 +24,12 @@ #define DEBUG_TYPE "frame-info" +static cl::opt EnableSpillVGPRToAGPR( + "amdgpu-spill-vgpr-to-agpr", + cl::desc("Enable spilling VGPRs to AGPRs"), + cl::ReallyHidden, + cl::init(true)); + // Find a scratch register that we can use at the start of the prologue to // re-align the stack pointer. We avoid using callee-save registers since they // may appear to be free when this is called from canUseAsPrologue (during @@ -1243,9 +1249,51 @@ MachineFrameInfo &MFI = MF.getFrameInfo(); const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() + && EnableSpillVGPRToAGPR; + + if (SpillVGPRToAGPR) { + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator Next; + for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + if (TII->isVGPRSpill(MI)) { + // Try to eliminate stack used by VGPR spills before frame + // finalization. 
+ unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vaddr); + int FI = MI.getOperand(FIOp).getIndex(); + Register VReg = + TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); + if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, + TRI->isAGPR(MRI, VReg))) { + // FIXME: change to enterBasicBlockEnd() + RS->enterBasicBlock(MBB); + TRI->eliminateFrameIndex(MI, 0, FIOp, RS); + continue; + } + } + } + } + + for (MachineBasicBlock &MBB : MF) { + for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) + MBB.addLiveIn(Reg); + + for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) + MBB.addLiveIn(Reg); + + MBB.sortUniqueLiveIns(); + } + } + FuncInfo->removeDeadFrameIndices(MFI); assert(allSGPRSpillsAreDead(MF) && "SGPR spill should have been removed in SILowerSGPRSpills"); Index: llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -39,12 +39,6 @@ namespace { -static cl::opt EnableSpillVGPRToAGPR( - "amdgpu-spill-vgpr-to-agpr", - cl::desc("Enable spilling VGPRs to AGPRs"), - cl::ReallyHidden, - cl::init(true)); - class SILowerSGPRSpills : public MachineFunctionPass { private: const SIRegisterInfo *TRI = nullptr; @@ -79,6 +73,7 @@ INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) @@ -300,6 +295,7 @@ TRI = &TII->getRegisterInfo(); VRM = getAnalysisIfAvailable(); + LIS = getAnalysisIfAvailable(); assert(SaveBlocks.empty() && RestoreBlocks.empty()); @@ -317,21 +313,15 @@ MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() - && EnableSpillVGPRToAGPR; 
bool MadeChange = false; - - const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts(); - std::unique_ptr RS; - bool NewReservedRegs = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs()); - if (HasSGPRSpillToVGPR || SpillVGPRToAGPR) { + if (HasSGPRSpillToVGPR) { // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs // are spilled to VGPRs, in which case we can eliminate the stack usage. // @@ -346,27 +336,6 @@ MachineInstr &MI = *I; Next = std::next(I); - if (SpillToAGPR && TII->isVGPRSpill(MI)) { - // Try to eliminate stack used by VGPR spills before frame - // finalization. - unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::vaddr); - int FI = MI.getOperand(FIOp).getIndex(); - Register VReg = - TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); - if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, - TRI->isAGPR(MRI, VReg))) { - NewReservedRegs = true; - if (!RS) - RS.reset(new RegScavenger()); - - // FIXME: change to enterBasicBlockEnd() - RS->enterBasicBlock(MBB); - TRI->eliminateFrameIndex(MI, 0, FIOp, RS.get()); - continue; - } - } - if (!TII->isSGPRSpill(MI)) continue; @@ -374,23 +343,18 @@ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { NewReservedRegs = true; - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr); + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, + nullptr, LIS); (void)Spilled; assert(Spilled && "failed to spill SGPR to VGPR when allocated"); } } } + // FIXME: Adding to live-ins redundant with reserving registers. 
for (MachineBasicBlock &MBB : MF) { for (auto SSpill : FuncInfo->getSGPRSpillVGPRs()) MBB.addLiveIn(SSpill.VGPR); - - for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) - MBB.addLiveIn(Reg); - - for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) - MBB.addLiveIn(Reg); - MBB.sortUniqueLiveIns(); } Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -14,11 +14,13 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include #include @@ -323,6 +325,13 @@ // partially spill the SGPR to VGPRs. 
SGPRToVGPRSpills.erase(FI); NumVGPRSpillLanes -= I; + +#if 0 + DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), + "VGPRs for SGPR spilling", + 0, DS_Error); + MF.getFunction().getContext().diagnose(DiagOutOfRegs); +#endif return false; } Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -115,12 +115,23 @@ bool IsLoad) const; /// If \p OnlyToVGPR is true, this will only succeed if this + bool spillSGPRImpl(MachineBasicBlock::iterator MI, + const DebugLoc &DL, + Register Reg, + bool IsKill, + int Index, + RegScavenger *RS, + LiveIntervals *LIS = nullptr, + bool OnlyToVGPR = false) const; + bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr, bool OnlyToVGPR = false) const; bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr, bool OnlyToVGPR = false) const; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, @@ -128,7 +139,8 @@ RegScavenger *RS) const override; bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, - int FI, RegScavenger *RS) const; + int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr) const; StringRef getRegAsmName(MCRegister Reg) const override; Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -333,6 +333,17 @@ reserveRegisterTuples(Reserved, Reg); } + // Reserve VGPRs used for SGPR spilling. + // Note we treat freezeReservedRegs unusually because we run register + // allocation in two phases. It's OK to re-freeze with new registers for the + // second run. 
+#if 0 + for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { + for (auto &SpilledVGPR : SpilledFI.second) + reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); + } +#endif + // FIXME: Stop using reserved registers for this. for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) reserveRegisterTuples(Reserved, Reg); @@ -1143,10 +1154,14 @@ } } -bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, - int Index, - RegScavenger *RS, - bool OnlyToVGPR) const { +bool SIRegisterInfo::spillSGPRImpl(MachineBasicBlock::iterator MI, + const DebugLoc &DL, + Register SuperReg, + bool IsKill, + int Index, + RegScavenger *RS, + LiveIntervals *LIS, + bool OnlyToVGPR) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); @@ -1159,10 +1174,6 @@ const SIInstrInfo *TII = ST.getInstrInfo(); - Register SuperReg = MI->getOperand(0).getReg(); - bool IsKill = MI->getOperand(0).isKill(); - const DebugLoc &DL = MI->getDebugLoc(); - assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && SuperReg != MFI->getFrameOffsetReg())); @@ -1193,6 +1204,13 @@ .addImm(Spill.Lane) .addReg(Spill.VGPR); + if (LIS) { + if (i == 0) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } + if (i == 0 && NumSubRegs > 1) { // We may be spilling a super-register which is only partially defined, // and need to ensure later spills think the value is defined. 
@@ -1256,12 +1274,31 @@ MI->eraseFromParent(); MFI->addToSpilledSGPRs(NumSubRegs); + + if (LIS) + LIS->removeAllRegUnitsForPhysReg(SuperReg); + return true; } +bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, + int Index, + RegScavenger *RS, + LiveIntervals *LIS, + bool OnlyToVGPR) const { + + Register SuperReg = MI->getOperand(0).getReg(); + bool IsKill = MI->getOperand(0).isKill(); + const DebugLoc &DL = MI->getDebugLoc(); + + return spillSGPRImpl(MI, DL, SuperReg, IsKill, Index, + RS, LIS, OnlyToVGPR); +} + bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, + LiveIntervals *LIS, bool OnlyToVGPR) const { MachineFunction *MF = MI->getParent()->getParent(); MachineBasicBlock *MBB = MI->getParent(); @@ -1301,8 +1338,16 @@ .addImm(Spill.Lane); if (NumSubRegs > 1 && i == 0) MIB.addReg(SuperReg, RegState::ImplicitDefine); + + if (LIS) { + if (i == e - 1) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } } } else { + assert(!LIS); Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); RS->setRegUsed(TmpVGPR); @@ -1330,11 +1375,22 @@ .addImm(i); if (NumSubRegs > 1 && i == 0) MIB.addReg(SuperReg, RegState::ImplicitDefine); + + if (LIS) { + if (i == e - 1) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } } } } MI->eraseFromParent(); + + if (LIS) + LIS->removeAllRegUnitsForPhysReg(SuperReg); + return true; } @@ -1344,7 +1400,8 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, - RegScavenger *RS) const { + RegScavenger *RS, + LiveIntervals *LIS) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: @@ -1355,7 +1412,7 @@ case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, true); + return spillSGPR(MI, FI, RS, LIS, true); case 
AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: @@ -1365,7 +1422,7 @@ case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return restoreSGPR(MI, FI, RS, true); + return restoreSGPR(MI, FI, RS, LIS, true); default: llvm_unreachable("not an SGPR spill instruction"); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -8,43 +8,43 @@ ; GCN-LABEL: v_extract_v64i32_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v0 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v0 ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xffffc000 ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 ; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc -; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 -; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v0, v11 +; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v1, v12, vcc ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v12, s5 ; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 
offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off -; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 -; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc -; GCN-NEXT: global_load_dwordx4 v[11:14], v[15:16], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[15:18], v[15:16], off offset:48 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte 
Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v0, v11 +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v1, v12, vcc +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[19:22], v[31:32], off ; GCN-NEXT: global_load_dwordx4 v[23:26], v[31:32], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[27:30], v[31:32], off offset:32 @@ -52,235 +52,238 @@ ; GCN-NEXT: global_load_dwordx4 v[35:38], v[48:49], off ; GCN-NEXT: global_load_dwordx4 v[39:42], v[48:49], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[48:49], off offset:32 -; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 -; GCN-NEXT: v_add_u32_e32 v1, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v63, 6, s33 +; GCN-NEXT: v_add_u32_e32 v63, 0x100, v63 +; GCN-NEXT: v_add_u32_e32 v0, 20, v63 +; GCN-NEXT: v_add_u32_e32 v2, 16, v63 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded 
Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, 
s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 20, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; 
GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 28, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 36, v0 -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 44, v0 -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 52, v0 -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 60, v0 -; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 -; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 -; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 -; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: buffer_store_dword v29, v1, 
s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 -; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 -; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 -; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 24, v63 +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 28, v63 +; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 32, v63 +; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 36, v63 +; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 40, v63 +; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 44, v63 +; GCN-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 48, v63 +; GCN-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 52, v63 +; GCN-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 56, v63 +; GCN-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 60, v63 +; GCN-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 64, v63 +; GCN-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x44, v63 +; GCN-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x48, v63 +; GCN-NEXT: 
buffer_store_dword v21, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x4c, v63 +; GCN-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x50, v63 +; GCN-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x54, v63 +; GCN-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x58, v63 +; GCN-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x5c, v63 +; GCN-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x60, v63 +; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x64, v63 +; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x68, v63 +; GCN-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x6c, v63 +; GCN-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x70, v63 +; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x74, v63 +; GCN-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x78, v63 +; GCN-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x7c, v63 +; GCN-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x80, v63 +; GCN-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x84, v63 +; GCN-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x88, v63 +; GCN-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x8c, v63 +; GCN-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x90, v63 +; GCN-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x94, v63 +; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x98, v63 +; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 
offen +; GCN-NEXT: v_add_u32_e32 v0, 0x9c, v63 +; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xa0, v63 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v9, v16 -; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xa4, v63 +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; 
GCN-NEXT: v_mov_b32_e32 v10, v17 -; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xa8, v63 +; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 -; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, 
off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 +; GCN-NEXT: v_add_u32_e32 v0, 0xac, v63 +; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb0, v63 +; GCN-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb4, v63 +; GCN-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb8, v63 +; GCN-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xbc, v63 +; GCN-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xc0, v63 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 8, v0 -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 12, v0 -; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xc4, v63 +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xc8, v63 +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xcc, v63 +; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 4, v63 +; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 8, v63 +; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 12, v63 +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xd4, v0 -; GCN-NEXT: buffer_store_dword v52, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xd8, v0 -; GCN-NEXT: buffer_store_dword v53, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xdc, v0 -; GCN-NEXT: buffer_store_dword v54, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 -; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe4, v0 -; 
GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe8, v0 -; GCN-NEXT: buffer_store_dword v57, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xec, v0 -; GCN-NEXT: buffer_store_dword v58, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xf0, v0 -; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xf4, v0 -; GCN-NEXT: buffer_store_dword v60, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xf8, v0 -; GCN-NEXT: buffer_store_dword v61, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xfc, v0 -; GCN-NEXT: buffer_store_dword v62, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 63, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GCN-NEXT: v_add_u32_e32 v0, v0, v1 +; GCN-NEXT: v_add_u32_e32 v0, 0xd0, v63 +; GCN-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xd4, v63 +; GCN-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xd8, v63 +; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xdc, v63 +; GCN-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xe0, v63 +; GCN-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xe4, v63 +; GCN-NEXT: buffer_store_dword v56, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xe8, v63 +; GCN-NEXT: buffer_store_dword v57, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xec, v63 +; GCN-NEXT: buffer_store_dword v58, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xf0, v63 +; GCN-NEXT: buffer_store_dword v59, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xf4, v63 +; GCN-NEXT: buffer_store_dword v60, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xf8, v63 +; GCN-NEXT: buffer_store_dword v61, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xfc, v63 +; GCN-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:512 ; 
4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 63, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v0, v63, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded 
Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -293,43 +296,43 @@ ; GCN-LABEL: v_extract_v128i16_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v0 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v0 ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xffffc000 ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 ; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc -; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 -; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v0, v11 +; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v1, v12, vcc ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v12, s5 ; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off -; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 -; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc -; GCN-NEXT: global_load_dwordx4 v[11:14], v[15:16], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[15:18], v[15:16], off offset:48 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v0, v11 +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v1, v12, vcc +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[19:22], v[31:32], off ; GCN-NEXT: global_load_dwordx4 v[23:26], v[31:32], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[27:30], v[31:32], off offset:32 @@ -337,240 +340,243 @@ ; GCN-NEXT: global_load_dwordx4 v[35:38], v[48:49], off ; GCN-NEXT: global_load_dwordx4 v[39:42], v[48:49], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[48:49], off offset:32 -; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 -; GCN-NEXT: v_add_u32_e32 v1, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v63, 6, s33 +; GCN-NEXT: v_add_u32_e32 v63, 0x100, v63 +; GCN-NEXT: v_add_u32_e32 v0, 20, 
v63 +; GCN-NEXT: v_add_u32_e32 v2, 16, v63 +; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v63 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword 
v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: 
global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 20, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 28, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 36, v0 -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 44, v0 -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 52, v0 -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 60, v0 -; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 -; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 -; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x5c, 
v0 -; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 -; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 -; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 -; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 -; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, 
s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 24, v63 +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 28, v63 +; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 32, v63 +; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 36, v63 +; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 40, v63 +; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 44, v63 +; GCN-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 48, v63 +; GCN-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 52, v63 +; GCN-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 56, v63 +; GCN-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 60, 
v63 +; GCN-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 64, v63 +; GCN-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x44, v63 +; GCN-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x48, v63 +; GCN-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x4c, v63 +; GCN-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x50, v63 +; GCN-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x54, v63 +; GCN-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x58, v63 +; GCN-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x5c, v63 +; GCN-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x60, v63 +; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x64, v63 +; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x68, v63 +; GCN-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x6c, v63 +; GCN-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x70, v63 +; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x74, v63 +; GCN-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x78, v63 +; GCN-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x7c, v63 +; GCN-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x80, v63 +; GCN-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x84, v63 +; GCN-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x88, v63 +; GCN-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x8c, v63 +; GCN-NEXT: buffer_store_dword v38, 
v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x90, v63 +; GCN-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x94, v63 +; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x98, v63 +; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x9c, v63 +; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xa0, v63 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: buffer_store_dword v8, 
v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v9, v16 -; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xa4, v63 +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v10, v17 -; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xa8, v63 +; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 -; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 +; GCN-NEXT: v_add_u32_e32 v0, 0xac, v63 +; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb0, v63 +; GCN-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb4, v63 +; GCN-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb8, v63 +; GCN-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xbc, v63 +; GCN-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte 
Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xc0, v63 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 8, v0 -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 12, v0 -; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xc4, v63 +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xc8, v63 +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xcc, v63 +; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 4, v63 +; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 8, v63 +; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 12, v63 +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_add_u32_e32 v3, 0xd0, v0 -; GCN-NEXT: buffer_store_dword v51, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xd4, v0 -; GCN-NEXT: buffer_store_dword v52, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xd8, v0 -; GCN-NEXT: 
buffer_store_dword v53, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xdc, v0 -; GCN-NEXT: buffer_store_dword v54, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xe0, v0 -; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xe4, v0 -; GCN-NEXT: buffer_store_dword v56, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xe8, v0 -; GCN-NEXT: buffer_store_dword v57, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xec, v0 -; GCN-NEXT: buffer_store_dword v58, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xf0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v2 -; GCN-NEXT: buffer_store_dword v59, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xf4, v0 -; GCN-NEXT: v_and_b32_e32 v1, 63, v1 -; GCN-NEXT: buffer_store_dword v60, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xf8, v0 -; GCN-NEXT: buffer_store_dword v61, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xfc, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GCN-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN-NEXT: buffer_store_dword v62, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd4, v63 +; GCN-NEXT: buffer_store_dword v52, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd8, v63 +; GCN-NEXT: buffer_store_dword v53, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xdc, v63 +; GCN-NEXT: buffer_store_dword v54, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v63 +; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe4, v63 +; GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe8, v63 +; GCN-NEXT: buffer_store_dword v57, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xec, v63 +; GCN-NEXT: buffer_store_dword v58, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf0, v63 +; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 
offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf4, v63 +; GCN-NEXT: buffer_store_dword v60, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf8, v63 +; GCN-NEXT: buffer_store_dword v61, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xfc, v63 +; GCN-NEXT: buffer_store_dword v62, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GCN-NEXT: v_and_b32_e32 v0, 63, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v0, v63, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: v_and_b32_e32 v1, 1, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: s_waitcnt vmcnt(16) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -583,43 +589,43 @@ ; GCN-LABEL: v_extract_v32i64_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v0 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v0 ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xffffc000 ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 ; GCN-NEXT: v_mov_b32_e32 v11, s4 
-; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc -; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 -; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v0, v11 +; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v1, v12, vcc ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v12, s5 ; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off -; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 -; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc -; GCN-NEXT: global_load_dwordx4 v[11:14], v[15:16], off offset:32 -; GCN-NEXT: 
global_load_dwordx4 v[15:18], v[15:16], off offset:48 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v0, v11 +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v1, v12, vcc +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[19:22], v[31:32], off ; GCN-NEXT: global_load_dwordx4 v[23:26], v[31:32], off offset:16 ; GCN-NEXT: 
global_load_dwordx4 v[27:30], v[31:32], off offset:32 @@ -627,237 +633,240 @@ ; GCN-NEXT: global_load_dwordx4 v[35:38], v[48:49], off ; GCN-NEXT: global_load_dwordx4 v[39:42], v[48:49], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[48:49], off offset:32 -; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 -; GCN-NEXT: v_add_u32_e32 v1, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v63, 6, s33 +; GCN-NEXT: v_add_u32_e32 v63, 0x100, v63 +; GCN-NEXT: v_add_u32_e32 v0, 20, v63 +; GCN-NEXT: v_add_u32_e32 v2, 16, v63 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 20, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 28, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 36, v0 -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 44, v0 -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 52, v0 -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 60, v0 -; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 -; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: 
buffer_store_dword v21, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 -; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 -; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 -; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 -; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: 
v_add_u32_e32 v1, 0x9c, v0 -; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 24, v63 +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 28, v63 +; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 32, v63 +; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 36, v63 +; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 40, v63 +; 
GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 44, v63 +; GCN-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 48, v63 +; GCN-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 52, v63 +; GCN-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 56, v63 +; GCN-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 60, v63 +; GCN-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 64, v63 +; GCN-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x44, v63 +; GCN-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x48, v63 +; GCN-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x4c, v63 +; GCN-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x50, v63 +; GCN-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x54, v63 +; GCN-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x58, v63 +; GCN-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x5c, v63 +; GCN-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x60, v63 +; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x64, v63 +; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x68, v63 +; GCN-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x6c, v63 +; GCN-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x70, v63 +; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x74, v63 +; GCN-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x78, v63 +; GCN-NEXT: buffer_store_dword v33, v0, s[0:3], 0 
offen +; GCN-NEXT: v_add_u32_e32 v0, 0x7c, v63 +; GCN-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x80, v63 +; GCN-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x84, v63 +; GCN-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x88, v63 +; GCN-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x8c, v63 +; GCN-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x90, v63 +; GCN-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x94, v63 +; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x98, v63 +; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x9c, v63 +; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xa0, v63 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v9, v16 -; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xa4, v63 +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v10, v17 -; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xa8, v63 +; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 -; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, 
off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 +; GCN-NEXT: v_add_u32_e32 v0, 0xac, v63 +; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb0, v63 +; GCN-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb4, v63 +; GCN-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb8, v63 +; GCN-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xbc, v63 +; GCN-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xc0, v63 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 8, v0 -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 12, v0 -; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xc4, v63 +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xc8, v63 +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xcc, v63 +; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 4, v63 +; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; 
GCN-NEXT: v_add_u32_e32 v0, 8, v63 +; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 12, v63 +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xd4, v0 -; GCN-NEXT: buffer_store_dword v52, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xd8, v0 -; GCN-NEXT: buffer_store_dword v53, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xdc, v0 -; GCN-NEXT: buffer_store_dword v54, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 -; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe4, v0 -; GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe8, v0 -; GCN-NEXT: buffer_store_dword v57, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xec, v0 -; GCN-NEXT: buffer_store_dword v58, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xf0, v0 -; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xf4, v0 -; GCN-NEXT: buffer_store_dword v60, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xf8, v0 -; GCN-NEXT: buffer_store_dword v61, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xfc, v0 -; GCN-NEXT: buffer_store_dword v62, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 31, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GCN-NEXT: v_add_u32_e32 v0, v0, v1 +; GCN-NEXT: v_add_u32_e32 v0, 0xd0, v63 +; GCN-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xd4, v63 +; GCN-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xd8, v63 +; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xdc, v63 +; GCN-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xe0, v63 +; GCN-NEXT: buffer_store_dword v55, 
v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xe4, v63 +; GCN-NEXT: buffer_store_dword v56, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xe8, v63 +; GCN-NEXT: buffer_store_dword v57, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xec, v63 +; GCN-NEXT: buffer_store_dword v58, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xf0, v63 +; GCN-NEXT: buffer_store_dword v59, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xf4, v63 +; GCN-NEXT: buffer_store_dword v60, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xf8, v63 +; GCN-NEXT: buffer_store_dword v61, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xfc, v63 +; GCN-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 31, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_add_u32_e32 v0, v63, v0 ; GCN-NEXT: v_add_u32_e32 v1, 4, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -979,8 +979,8 @@ ; MOVREL-NEXT: s_mov_b32 s14, s16 ; MOVREL-NEXT: v_mov_b32_e32 v16, s15 ; MOVREL-NEXT: v_mov_b32_e32 v2, s1 -; MOVREL-NEXT: v_mov_b32_e32 v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; MOVREL-NEXT: v_mov_b32_e32 v1, s0 ; MOVREL-NEXT: v_mov_b32_e32 v15, s14 ; MOVREL-NEXT: v_mov_b32_e32 v14, s13 ; MOVREL-NEXT: v_mov_b32_e32 v13, s12 @@ -995,30 +995,28 @@ ; MOVREL-NEXT: v_mov_b32_e32 v4, s3 ; MOVREL-NEXT: v_mov_b32_e32 v3, s2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; MOVREL-NEXT: s_mov_b32 s30, s18 -; MOVREL-NEXT: s_mov_b32 s31, s19 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v0 -; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s30, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s31, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 5, v0 -; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s30, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s31, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 4, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 6, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 7, v0 -; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s30, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s31, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s30, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s31, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s30, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s31, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s30, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s31, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s30, s3 -; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s31, s3 -; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s30, s4 -; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s31, s4 +; MOVREL-NEXT: 
v_cndmask_b32_e64 v5, v5, s18, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s19, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s18, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s19, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s18, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s19, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s18, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s19, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s18, s3 +; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s19, s3 +; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s18, s4 +; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s19, s4 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -29,7 +29,7 @@ ; ; GFX10NSA-LABEL: gather4_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -45,7 +45,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -83,7 +83,7 @@ ; ; GFX10NSA-LABEL: gather4_cube: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; 
GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff @@ -102,7 +102,7 @@ ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -140,7 +140,7 @@ ; ; GFX10NSA-LABEL: gather4_2darray: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff @@ -159,7 +159,7 @@ ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -195,7 +195,7 @@ ; ; GFX10NSA-LABEL: gather4_c_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -211,7 +211,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -249,7 +249,7 @@ ; ; GFX10NSA-LABEL: 
gather4_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff @@ -268,7 +268,7 @@ ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -306,7 +306,7 @@ ; ; GFX10NSA-LABEL: gather4_c_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff @@ -325,7 +325,7 @@ ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -361,7 +361,7 @@ ; ; GFX10NSA-LABEL: gather4_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -377,7 +377,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], 
s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -413,7 +413,7 @@ ; ; GFX10NSA-LABEL: gather4_c_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -429,7 +429,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -467,7 +467,7 @@ ; ; GFX10NSA-LABEL: gather4_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff @@ -486,7 +486,7 @@ ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -524,7 +524,7 @@ ; ; GFX10NSA-LABEL: gather4_c_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff @@ -543,7 +543,7 @@ ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 
v3, v4, v5, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -7097,25 +7097,23 @@ ; ; GFX10-LABEL: s_saddsat_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s28, s0, s8 +; GFX10-NEXT: s_add_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 -; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 -; GFX10-NEXT: s_addc_u32 s29, s1, s9 +; GFX10-NEXT: s_addc_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_and_b32 s18, s18, 1 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 -; GFX10-NEXT: s_addc_u32 s30, s2, s10 +; GFX10-NEXT: s_addc_u32 s18, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s19, s19, 1 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_addc_u32 s31, s3, s11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] -; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] +; GFX10-NEXT: s_addc_u32 s19, s3, s11 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 @@ -7136,27 +7134,27 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: s_cselect_b32 s23, 1, 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 -; GFX10-NEXT: 
s_lshl_b64 s[2:3], s[30:31], s22 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[16:17], s20 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[18:19], s22 ; GFX10-NEXT: s_and_b32 s24, s10, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s31, 31 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 +; GFX10-NEXT: s_ashr_i32 s2, s19, 31 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[18:19], s20 +; GFX10-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 ; GFX10-NEXT: s_cmp_lg_u32 s24, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] ; GFX10-NEXT: s_and_b32 s10, s23, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s17 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s24, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_mov_b32_e32 v1, s16 ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 @@ -7165,7 +7163,7 @@ ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 +; GFX10-NEXT: v_mov_b32_e32 v3, s19 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 @@ -7173,7 +7171,7 @@ ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_addc_u32 s3, s3, s23 ; GFX10-NEXT: s_add_u32 s0, s4, s12 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -7083,25 +7083,23 @@ ; ; GFX10-LABEL: s_ssubsat_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sub_u32 s28, s0, s8 +; GFX10-NEXT: s_sub_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 -; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 -; GFX10-NEXT: s_subb_u32 s29, s1, s9 +; GFX10-NEXT: s_subb_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_and_b32 s18, s18, 1 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 -; GFX10-NEXT: s_subb_u32 s30, s2, s10 +; GFX10-NEXT: s_subb_u32 s18, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s19, s19, 1 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_subb_u32 s31, s3, s11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] -; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] +; GFX10-NEXT: s_subb_u32 s19, s3, s11 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 @@ -7122,27 +7120,27 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: s_cselect_b32 s23, 1, 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[16:17], s20 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[18:19], s22 ; GFX10-NEXT: s_and_b32 s24, s10, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s31, 31 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 +; GFX10-NEXT: 
s_ashr_i32 s2, s19, 31 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[18:19], s20 +; GFX10-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 ; GFX10-NEXT: s_cmp_lg_u32 s24, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] ; GFX10-NEXT: s_and_b32 s10, s23, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s17 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s24, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_mov_b32_e32 v1, s16 ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 @@ -7151,7 +7149,7 @@ ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 +; GFX10-NEXT: v_mov_b32_e32 v3, s19 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 @@ -7159,7 +7157,7 @@ ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_addc_u32 s3, s3, s23 ; GFX10-NEXT: s_sub_u32 s0, s4, s12 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -3025,7 +3025,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s10, s10, 1 ; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: s_subb_u32 s14, s2, s6 +; GFX10-NEXT: s_subb_u32 s10, s2, s6 ; 
GFX10-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10-NEXT: s_and_b32 s11, s11, 1 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 @@ -3040,7 +3040,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s14, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -3681,7 +3681,7 @@ ; GFX10-NEXT: s_and_b32 s1, s1, 1 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[4:5], s[12:13] -; GFX10-NEXT: s_subb_u32 s30, s6, s14 +; GFX10-NEXT: s_subb_u32 s10, s6, s14 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s0, s0, 1 @@ -3707,7 +3707,7 @@ ; GFX10-NEXT: v_readfirstlane_b32 s2, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s30, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, 0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s3, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -123,7 +123,7 @@ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s12, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s13, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX1064-NEXT: s_cbranch_execz BB0_3 ; GFX1064-NEXT: ; %bb.2: ; GFX1064-NEXT: s_bcnt1_i32_b64 s12, s[12:13] @@ -131,7 +131,7 @@ ; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX1064-NEXT: BB0_3: ; GFX1064-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4 @@ -354,14 +354,14 @@ ; GFX1064-NEXT: s_mov_b64 exec, s[10:11] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_3 ; GFX1064-NEXT: ; %bb.2: ; GFX1064-NEXT: v_mov_b32_e32 v0, s12 ; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1064-NEXT: BB1_3: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 Index: llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll +++ llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll @@ -1,10 +1,10 @@ ; -enable-misched=false makes the register usage more predictable ; -regalloc=fast just makes the test run faster -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s 
--check-prefixes=GCN,GFX10CU-WAVE32 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64 +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64 define internal void @use256vgprs() { %v0 = call i32 asm sideeffect "; def $0", "=v"() Index: llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -260,22 +260,22 @@ ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. 
; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr: ; GCN: s_waitcnt -; GCN-NEXT: v_writelane_b32 v1, s33, 63 -; GCN-COUNT-60: v_writelane_b32 v1 +; GCN-NEXT: v_writelane_b32 v0, s33, 63 +; GCN-COUNT-60: v_writelane_b32 v0 ; GCN: s_mov_b32 s33, s32 -; GCN-COUNT-2: v_writelane_b32 v1 +; GCN-COUNT-2: v_writelane_b32 v0 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8 ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v1 +; GCN: v_writelane_b32 v0 ; MUBUF: s_add_u32 s32, s32, 0x300 ; MUBUF: s_sub_u32 s32, s32, 0x300 ; FLATSCR: s_add_u32 s32, s32, 12 ; FLATSCR: s_sub_u32 s32, s32, 12 -; GCN-NEXT: v_readlane_b32 s33, v1, 63 +; GCN-NEXT: v_readlane_b32 s33, v0, 63 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @last_lane_vgpr_for_fp_csr() #1 { @@ -297,21 +297,21 @@ ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. 
; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr: ; GCN: s_waitcnt -; GCN-COUNT-62: v_writelane_b32 v1, +; GCN-COUNT-62: v_writelane_b32 v0, ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: v_writelane_b32 v1, +; GCN: v_writelane_b32 v0, ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v1, +; GCN: v_writelane_b32 v0, ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; MUBUF: s_add_u32 s32, s32, 0x300 ; FLATSCR: s_add_u32 s32, s32, 12 -; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 +; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v0 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] @@ -359,23 +359,23 @@ ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp: ; GCN: s_waitcnt -; GCN-NEXT: v_writelane_b32 v1, s33, 2 -; GCN-NEXT: v_writelane_b32 v1, s30, 0 +; GCN-NEXT: v_writelane_b32 v0, s33, 2 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN: v_writelane_b32 v1, s31, 1 +; GCN: v_writelane_b32 v0, s31, 1 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 ; GCN: ;;#ASMSTART -; MUBUF: v_readlane_b32 s4, v1, 0 +; MUBUF: v_readlane_b32 s4, v0, 0 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 -; MUBUF-NEXT: v_readlane_b32 s5, v1, 1 -; FLATSCR: v_readlane_b32 s0, v1, 0 +; MUBUF-NEXT: v_readlane_b32 s5, v0, 1 +; FLATSCR: v_readlane_b32 s0, v0, 0 ; FLATSCR-NEXT: s_add_u32 s32, s32, 8 -; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s1, v0, 1 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 -; GCN-NEXT: v_readlane_b32 s33, v1, 2 +; 
GCN-NEXT: v_readlane_b32 s33, v0, 2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[4:5] ; FLATSCR-NEXT: s_setpc_b64 s[0:1] @@ -632,6 +632,67 @@ ret void } +; We can use a non-csr in a leaf function. + +; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls_low_regs: +; GCN-NOT: buffer_store_dword +; GCN: v_writelane_b32 v8, +; GCN: v_readlane_b32 s{{[0-9]+}}, v8 +; GCN-NOT: buffer_load_dword +define void @callee_func_sgpr_spill_no_calls_low_regs(i32 %in) #0 { + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr5 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr5) #0 + ret void +} + +; GCN-LABEL: {{^}}callee_func_sgpr_spill_calls_low_regs: +; MUBUF: buffer_store_dword v40 +; MUBUF: buffer_store_dword v41 +; FLATSCR: scratch_store_dword off, v40 +; FLATSCR: scratch_store_dword off, v41 +; GCN: v_writelane_b32 v40, +; GCN: v_writelane_b32 v41, +; GCN: v_readlane_b32 s{{[0-9]+}}, v40 +; GCN: v_readlane_b32 s{{[0-9]+}}, v41 +; MUBUF: buffer_load_dword v40 +; MUBUF: buffer_load_dword v41 +; FLATSCR: scratch_load_dword v40 +; FLATSCR: scratch_load_dword v41 + +define void @callee_func_sgpr_spill_calls_low_regs(i32 %in) #0 { + call void asm sideeffect "", 
"~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr5 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + + call void @external_void_func_void() + + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr5) #0 + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind "frame-pointer"="all" } attributes #2 = { nounwind "frame-pointer"="non-leaf" } Index: llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -2860,31 +2860,31 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v42, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v42, s30, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded 
Spill +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v42, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v42, v1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: global_store_dword v[40:41], v0, off -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v42, 0 -; GFX9-NEXT: v_readlane_b32 s5, v42, 1 +; GFX9-NEXT: global_store_dword v[41:42], v0, off +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 -; GFX9-NEXT: v_readlane_b32 s33, v42, 2 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -2894,33 +2894,33 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v42, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_mov_b32_e32 v40, v0 -; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v41, v1 -; GFX10-NEXT: v_writelane_b32 v42, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: global_store_dword v[40:41], v0, off +; GFX10-NEXT: global_store_dword v[41:42], v0, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s4, v42, 0 -; GFX10-NEXT: v_readlane_b32 s5, v42, 1 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 -; GFX10-NEXT: v_readlane_b32 s33, v42, 2 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -184,33 +184,33 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v41, s33, 2 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v41, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, v31 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: v_mov_b32_e32 v31, v41 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 -; GFX9-NEXT: v_readlane_b32 s5, v41, 1 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; 
GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 -; GFX9-NEXT: v_readlane_b32 s33, v41, 2 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -220,34 +220,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v41, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v40, v31 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v31 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v31, v40 +; GFX10-NEXT: v_mov_b32_e32 v31, v41 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; 
GFX10-NEXT: v_readlane_b32 s4, v41, 0 -; GFX10-NEXT: v_readlane_b32 s5, v41, 1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 -; GFX10-NEXT: v_readlane_b32 s33, v41, 2 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -728,14 +728,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v41, s33, 3 -; GFX9-NEXT: v_writelane_b32 v41, s40, 0 -; GFX9-NEXT: v_writelane_b32 v41, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND @@ -745,23 +745,23 @@ ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v41, s31, 2 -; GFX9-NEXT: v_mov_b32_e32 v40, v32 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_mov_b32_e32 v41, v32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: 
; use s40 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v40 +; GFX9-NEXT: ; use v41 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v41, 1 -; GFX9-NEXT: v_readlane_b32 s5, v41, 2 -; GFX9-NEXT: v_readlane_b32 s40, v41, 0 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 1 +; GFX9-NEXT: v_readlane_b32 s5, v40, 2 +; GFX9-NEXT: v_readlane_b32 s40, v40, 0 ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 -; GFX9-NEXT: v_readlane_b32 s33, v41, 3 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -771,41 +771,41 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v41, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v41, s40, 0 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s40, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: 
;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v40, v32 -; GFX10-NEXT: v_writelane_b32 v41, s30, 1 -; GFX10-NEXT: v_writelane_b32 v41, s31, 2 +; GFX10-NEXT: v_mov_b32_e32 v41, v32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v40 +; GFX10-NEXT: ; use v41 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s4, v41, 1 -; GFX10-NEXT: v_readlane_b32 s5, v41, 2 -; GFX10-NEXT: v_readlane_b32 s40, v41, 0 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s4, v40, 1 +; GFX10-NEXT: v_readlane_b32 s5, v40, 2 +; GFX10-NEXT: v_readlane_b32 s40, v40, 0 ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 -; GFX10-NEXT: v_readlane_b32 s33, v41, 3 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/indirect-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -202,32 +202,32 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; 
GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 
15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -235,13 +235,13 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NEXT: v_mov_b32_e32 v42, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v42 +; GCN-NEXT: v_readfirstlane_b32 s17, v43 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[42:43] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -250,36 +250,36 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB2_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; 
GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -292,32 +292,32 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; 
GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 
15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -325,13 +325,13 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NEXT: v_mov_b32_e32 v42, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v42 +; GCN-NEXT: v_readfirstlane_b32 s17, v43 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[42:43] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0x7b ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] @@ -341,36 +341,36 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB3_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; 
GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -383,32 +383,32 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; 
GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 
15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -416,13 +416,13 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NEXT: v_mov_b32_e32 v42, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v42 +; GCN-NEXT: v_readfirstlane_b32 s17, v43 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[42:43] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -431,37 +431,37 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB4_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; 
GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -475,32 +475,32 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; 
GCN-NEXT: v_writelane_b32 v43, s33, 19 +; GCN-NEXT: v_writelane_b32 v40, s33, 19 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s50, 15 -; GCN-NEXT: v_writelane_b32 v43, s51, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 
14 +; GCN-NEXT: v_writelane_b32 v40, s50, 15 +; GCN-NEXT: v_writelane_b32 v40, s51, 16 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -508,20 +508,20 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NEXT: v_mov_b32_e32 v42, v0 ; GCN-NEXT: v_and_b32_e32 v0, 1, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GCN-NEXT: s_cbranch_execz BB5_4 ; GCN-NEXT: ; %bb.1: ; %bb1 -; GCN-NEXT: v_writelane_b32 v43, s30, 17 -; GCN-NEXT: v_writelane_b32 v43, s31, 18 +; GCN-NEXT: v_writelane_b32 v40, s30, 17 +; GCN-NEXT: v_writelane_b32 v40, s31, 18 ; GCN-NEXT: s_mov_b64 s[48:49], exec ; GCN-NEXT: BB5_2: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v42 +; GCN-NEXT: v_readfirstlane_b32 s17, v43 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[42:43] ; GCN-NEXT: s_and_saveexec_b64 s[50:51], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -530,40 +530,40 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b64 exec, exec, s[50:51] ; GCN-NEXT: s_cbranch_execnz BB5_2 ; GCN-NEXT: ; %bb.3: ; GCN-NEXT: s_mov_b64 exec, s[48:49] -; GCN-NEXT: v_readlane_b32 s30, v43, 17 -; GCN-NEXT: v_readlane_b32 s31, v43, 18 +; GCN-NEXT: v_readlane_b32 s30, v40, 17 +; GCN-NEXT: v_readlane_b32 s31, v40, 18 ; GCN-NEXT: BB5_4: ; %bb2 ; GCN-NEXT: s_or_b64 exec, exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s51, v43, 16 -; GCN-NEXT: 
v_readlane_b32 s50, v43, 15 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s51, v40, 16 +; GCN-NEXT: v_readlane_b32 s50, v40, 15 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x800 -; GCN-NEXT: v_readlane_b32 s33, v43, 19 +; GCN-NEXT: v_readlane_b32 s33, v40, 19 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -583,26 +583,26 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v42, s33, 6 +; GCN-NEXT: v_writelane_b32 v40, s33, 6 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v42, s34, 0 -; GCN-NEXT: v_writelane_b32 v42, s35, 1 -; GCN-NEXT: v_writelane_b32 v42, s36, 2 -; GCN-NEXT: v_writelane_b32 v42, s37, 3 -; GCN-NEXT: v_writelane_b32 v42, s30, 4 -; GCN-NEXT: v_writelane_b32 v42, s31, 5 -; GCN-NEXT: v_mov_b32_e32 v41, v1 -; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s30, 4 +; GCN-NEXT: v_writelane_b32 v40, s31, 5 +; GCN-NEXT: v_mov_b32_e32 v42, v1 +; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[34:35], exec ; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v40 -; GCN-NEXT: v_readfirstlane_b32 s7, v41 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[40:41] +; GCN-NEXT: v_readfirstlane_b32 s6, v41 +; GCN-NEXT: v_readfirstlane_b32 
s7, v42 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[41:42] ; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc ; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] @@ -610,18 +610,18 @@ ; GCN-NEXT: s_cbranch_execnz BB6_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_readlane_b32 s4, v42, 4 -; GCN-NEXT: v_readlane_b32 s5, v42, 5 -; GCN-NEXT: v_readlane_b32 s37, v42, 3 -; GCN-NEXT: v_readlane_b32 s36, v42, 2 -; GCN-NEXT: v_readlane_b32 s35, v42, 1 -; GCN-NEXT: v_readlane_b32 s34, v42, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v42, 6 +; GCN-NEXT: v_readlane_b32 s33, v40, 6 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -96,13 +96,13 @@ ; ; GFX10-LABEL: image_sample_2d_f16_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s28, exec_lo +; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; 
GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: v_mov_b32_e32 v3, v5 -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v2 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -79,7 +79,7 @@ ; ; GFX10-LABEL: sample_1d_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe] +; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe] ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] @@ -92,7 +92,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00] @@ -499,7 +499,7 @@ ; ; GFX10-LABEL: sample_1d_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe] +; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: 
[0x7e,0x03,0x8e,0xbe] ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] @@ -512,7 +512,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00] Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -187,44 +187,44 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v43, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v43, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v43, s35, 1 +; GFX9-NEXT: v_writelane_b32 v40, s35, 1 ; GFX9-NEXT: s_load_dwordx2 
s[34:35], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 -; GFX9-NEXT: v_writelane_b32 v43, s30, 2 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 -; GFX9-NEXT: v_writelane_b32 v43, s31, 3 -; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 -; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 +; GFX9-NEXT: v_mov_b32_e32 v0, v41 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 +; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v43, 2 -; GFX9-NEXT: v_readlane_b32 s5, v43, 3 -; GFX9-NEXT: v_readlane_b32 s35, v43, 1 -; GFX9-NEXT: v_readlane_b32 s34, v43, 0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 3 +; GFX9-NEXT: v_readlane_b32 s35, v40, 1 +; GFX9-NEXT: v_readlane_b32 s34, v40, 0 ; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 -; GFX9-NEXT: v_readlane_b32 s33, v43, 4 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] Index: llvm/test/CodeGen/AMDGPU/pei-build-spill.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/pei-build-spill.mir +++ llvm/test/CodeGen/AMDGPU/pei-build-spill.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF %s +# RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF %s # RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-V2A %s -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s +# RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s # RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-V2A %s --- Index: llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll 
=================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -0,0 +1,110 @@ +; REQUIRES: asserts + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s +; RUN: llc -sgpr-regalloc=greedy -vgpr-regalloc=greedy -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s + +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=O0 %s + +; RUN: llc -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT-BASIC %s +; RUN: llc -sgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-DEFAULT %s +; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-BASIC %s + +; RUN: not --crash llc -regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s +; RUN: not --crash llc -regalloc=fast -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s + + +; REGALLOC: -regalloc not supported with amdgcn. 
Use -sgpr-regalloc and -vgpr-regalloc + +; DEFAULT: Greedy Register Allocator +; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: SI lower SGPR spill instructions +; DEFAULT-NEXT: Virtual Register Map +; DEFAULT-NEXT: Live Register Matrix +; DEFAULT-NEXT: Machine Optimization Remark Emitter +; DEFAULT-NEXT: Greedy Register Allocator +; DEFAULT-NEXT: GCN NSA Reassign +; DEFAULT-NEXT: GCN RegBank Reassign +; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: Stack Slot Coloring + +; O0: Fast Register Allocator +; O0-NEXT: SI lower SGPR spill instructions +; O0-NEXT: Fast Register Allocator +; O0-NEXT: SI Fix VGPR copies + + + + +; BASIC-DEFAULT: Debug Variable Analysis +; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis +; BASIC-DEFAULT-NEXT: Machine Block Frequency Analysis +; BASIC-DEFAULT-NEXT: Virtual Register Map +; BASIC-DEFAULT-NEXT: Live Register Matrix +; BASIC-DEFAULT-NEXT: Basic Register Allocator +; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions +; BASIC-DEFAULT-NEXT: Virtual Register Map +; BASIC-DEFAULT-NEXT: Live Register Matrix +; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges +; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis +; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis +; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter +; BASIC-DEFAULT-NEXT: Greedy Register Allocator +; BASIC-DEFAULT-NEXT: GCN NSA Reassign +; BASIC-DEFAULT-NEXT: GCN RegBank Reassign +; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: Stack Slot Coloring + + + +; DEFAULT-BASIC: Greedy Register Allocator +; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions +; DEFAULT-BASIC-NEXT: Virtual Register Map +; DEFAULT-BASIC-NEXT: Live Register Matrix +; DEFAULT-BASIC-NEXT: Basic Register Allocator +; DEFAULT-BASIC-NEXT: GCN NSA Reassign +; DEFAULT-BASIC-NEXT: GCN RegBank Reassign +; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; 
DEFAULT-BASIC-NEXT: Stack Slot Coloring + + + +; BASIC-BASIC: Debug Variable Analysis +; BASIC-BASIC-NEXT: Live Stack Slot Analysis +; BASIC-BASIC-NEXT: Machine Block Frequency Analysis +; BASIC-BASIC-NEXT: Virtual Register Map +; BASIC-BASIC-NEXT: Live Register Matrix +; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: SI lower SGPR spill instructions +; BASIC-BASIC-NEXT: Virtual Register Map +; BASIC-BASIC-NEXT: Live Register Matrix +; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: GCN NSA Reassign +; BASIC-BASIC-NEXT: GCN RegBank Reassign +; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: Stack Slot Coloring + + +declare void @bar() + +; Something with some CSR SGPR spills +define void @foo() { + call void asm sideeffect "; clobber", "~{s33}"() + call void @bar() + ret void +} + +; Block live out spills with fast regalloc +define amdgpu_kernel void @control_flow(i1 %cond) { + %s33 = call i32 asm sideeffect "; clobber", "={s33}"() + br i1 %cond, label %bb0, label %bb1 + +bb0: + call void asm sideeffect "; use %0", "s"(i32 %s33) + br label %bb1 + +bb1: + ret void +} Index: llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; The first 64 SGPR spills can go to a VGPR, but there isn't a second +; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element. 
+ +define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { +; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 0 +; GCN-NEXT: v_writelane_b32 v23, s9, 1 +; GCN-NEXT: v_writelane_b32 v23, s10, 2 +; GCN-NEXT: v_writelane_b32 v23, s11, 3 +; GCN-NEXT: v_writelane_b32 v23, s12, 4 +; GCN-NEXT: v_writelane_b32 v23, s13, 5 +; GCN-NEXT: v_writelane_b32 v23, s14, 6 +; GCN-NEXT: v_writelane_b32 v23, s15, 7 +; GCN-NEXT: v_writelane_b32 v23, s16, 8 +; GCN-NEXT: v_writelane_b32 v23, s17, 9 +; GCN-NEXT: v_writelane_b32 v23, s18, 10 +; GCN-NEXT: v_writelane_b32 v23, s19, 11 +; GCN-NEXT: v_writelane_b32 v23, s20, 12 +; GCN-NEXT: v_writelane_b32 v23, s21, 13 +; GCN-NEXT: v_writelane_b32 v23, s22, 14 +; GCN-NEXT: v_writelane_b32 v23, s23, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 16 +; GCN-NEXT: v_writelane_b32 v23, s9, 17 +; GCN-NEXT: v_writelane_b32 v23, s10, 18 +; GCN-NEXT: v_writelane_b32 v23, s11, 19 +; GCN-NEXT: v_writelane_b32 v23, s12, 20 +; GCN-NEXT: v_writelane_b32 v23, s13, 21 +; GCN-NEXT: v_writelane_b32 v23, s14, 22 +; GCN-NEXT: v_writelane_b32 v23, s15, 23 +; GCN-NEXT: v_writelane_b32 v23, s16, 24 +; GCN-NEXT: v_writelane_b32 v23, s17, 25 +; GCN-NEXT: v_writelane_b32 v23, s18, 26 +; GCN-NEXT: v_writelane_b32 v23, s19, 27 +; GCN-NEXT: v_writelane_b32 v23, s20, 28 +; GCN-NEXT: v_writelane_b32 v23, s21, 29 +; GCN-NEXT: v_writelane_b32 v23, s22, 30 +; GCN-NEXT: v_writelane_b32 v23, s23, 31 +; 
GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 32 +; GCN-NEXT: v_writelane_b32 v23, s9, 33 +; GCN-NEXT: v_writelane_b32 v23, s10, 34 +; GCN-NEXT: v_writelane_b32 v23, s11, 35 +; GCN-NEXT: v_writelane_b32 v23, s12, 36 +; GCN-NEXT: v_writelane_b32 v23, s13, 37 +; GCN-NEXT: v_writelane_b32 v23, s14, 38 +; GCN-NEXT: v_writelane_b32 v23, s15, 39 +; GCN-NEXT: v_writelane_b32 v23, s16, 40 +; GCN-NEXT: v_writelane_b32 v23, s17, 41 +; GCN-NEXT: v_writelane_b32 v23, s18, 42 +; GCN-NEXT: v_writelane_b32 v23, s19, 43 +; GCN-NEXT: v_writelane_b32 v23, s20, 44 +; GCN-NEXT: v_writelane_b32 v23, s21, 45 +; GCN-NEXT: v_writelane_b32 v23, s22, 46 +; GCN-NEXT: v_writelane_b32 v23, s23, 47 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 48 +; GCN-NEXT: v_writelane_b32 v23, s9, 49 +; GCN-NEXT: v_writelane_b32 v23, s10, 50 +; GCN-NEXT: v_writelane_b32 v23, s11, 51 +; GCN-NEXT: v_writelane_b32 v23, s12, 52 +; GCN-NEXT: v_writelane_b32 v23, s13, 53 +; GCN-NEXT: v_writelane_b32 v23, s14, 54 +; GCN-NEXT: v_writelane_b32 v23, s15, 55 +; GCN-NEXT: v_writelane_b32 v23, s16, 56 +; GCN-NEXT: v_writelane_b32 v23, s17, 57 +; GCN-NEXT: v_writelane_b32 v23, s18, 58 +; GCN-NEXT: v_writelane_b32 v23, s19, 59 +; GCN-NEXT: v_writelane_b32 v23, s20, 60 +; GCN-NEXT: v_writelane_b32 v23, s21, 61 +; GCN-NEXT: v_writelane_b32 v23, s22, 62 +; GCN-NEXT: v_writelane_b32 v23, s23, 63 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[6:7] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s6, 0 +; GCN-NEXT: v_writelane_b32 v0, s7, 1 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s4, s5 +; GCN-NEXT: s_cbranch_scc1 BB0_2 +; GCN-NEXT: ; %bb.1: ; %bb0 +; GCN-NEXT: 
v_readlane_b32 s4, v23, 0 +; GCN-NEXT: v_readlane_b32 s5, v23, 1 +; GCN-NEXT: v_readlane_b32 s6, v23, 2 +; GCN-NEXT: v_readlane_b32 s7, v23, 3 +; GCN-NEXT: v_readlane_b32 s8, v23, 4 +; GCN-NEXT: v_readlane_b32 s9, v23, 5 +; GCN-NEXT: v_readlane_b32 s10, v23, 6 +; GCN-NEXT: v_readlane_b32 s11, v23, 7 +; GCN-NEXT: v_readlane_b32 s12, v23, 8 +; GCN-NEXT: v_readlane_b32 s13, v23, 9 +; GCN-NEXT: v_readlane_b32 s14, v23, 10 +; GCN-NEXT: v_readlane_b32 s15, v23, 11 +; GCN-NEXT: v_readlane_b32 s16, v23, 12 +; GCN-NEXT: v_readlane_b32 s17, v23, 13 +; GCN-NEXT: v_readlane_b32 s18, v23, 14 +; GCN-NEXT: v_readlane_b32 s19, v23, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s4, v23, 16 +; GCN-NEXT: v_readlane_b32 s5, v23, 17 +; GCN-NEXT: v_readlane_b32 s6, v23, 18 +; GCN-NEXT: v_readlane_b32 s7, v23, 19 +; GCN-NEXT: v_readlane_b32 s8, v23, 20 +; GCN-NEXT: v_readlane_b32 s9, v23, 21 +; GCN-NEXT: v_readlane_b32 s10, v23, 22 +; GCN-NEXT: v_readlane_b32 s11, v23, 23 +; GCN-NEXT: v_readlane_b32 s12, v23, 24 +; GCN-NEXT: v_readlane_b32 s13, v23, 25 +; GCN-NEXT: v_readlane_b32 s14, v23, 26 +; GCN-NEXT: v_readlane_b32 s15, v23, 27 +; GCN-NEXT: v_readlane_b32 s16, v23, 28 +; GCN-NEXT: v_readlane_b32 s17, v23, 29 +; GCN-NEXT: v_readlane_b32 s18, v23, 30 +; GCN-NEXT: v_readlane_b32 s19, v23, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s4, v23, 32 +; GCN-NEXT: v_readlane_b32 s5, v23, 33 +; GCN-NEXT: v_readlane_b32 s6, v23, 34 +; GCN-NEXT: v_readlane_b32 s7, v23, 35 +; GCN-NEXT: v_readlane_b32 s8, v23, 36 +; GCN-NEXT: v_readlane_b32 s9, v23, 37 +; GCN-NEXT: v_readlane_b32 s10, v23, 38 +; GCN-NEXT: v_readlane_b32 s11, v23, 39 +; GCN-NEXT: v_readlane_b32 s12, v23, 40 +; GCN-NEXT: v_readlane_b32 s13, v23, 41 +; GCN-NEXT: v_readlane_b32 s14, v23, 42 +; GCN-NEXT: v_readlane_b32 s15, v23, 43 +; GCN-NEXT: v_readlane_b32 s16, v23, 44 +; GCN-NEXT: v_readlane_b32 s17, v23, 45 +; 
GCN-NEXT: v_readlane_b32 s18, v23, 46 +; GCN-NEXT: v_readlane_b32 s19, v23, 47 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s8, v23, 48 +; GCN-NEXT: v_readlane_b32 s9, v23, 49 +; GCN-NEXT: v_readlane_b32 s10, v23, 50 +; GCN-NEXT: v_readlane_b32 s11, v23, 51 +; GCN-NEXT: v_readlane_b32 s12, v23, 52 +; GCN-NEXT: v_readlane_b32 s13, v23, 53 +; GCN-NEXT: v_readlane_b32 s14, v23, 54 +; GCN-NEXT: v_readlane_b32 s15, v23, 55 +; GCN-NEXT: v_readlane_b32 s16, v23, 56 +; GCN-NEXT: v_readlane_b32 s17, v23, 57 +; GCN-NEXT: v_readlane_b32 s18, v23, 58 +; GCN-NEXT: v_readlane_b32 s19, v23, 59 +; GCN-NEXT: v_readlane_b32 s20, v23, 60 +; GCN-NEXT: v_readlane_b32 s21, v23, 61 +; GCN-NEXT: v_readlane_b32 s22, v23, 62 +; GCN-NEXT: v_readlane_b32 s23, v23, 63 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:5] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: BB0_2: ; %ret +; GCN-NEXT: s_endpgm + call void asm sideeffect "", "~{v[0:7]}" () #0 + call void asm sideeffect "", "~{v[8:15]}" () #0 + call void asm sideeffect "", "~{v[16:19]}"() #0 + call void asm sideeffect "", "~{v[20:21]}"() #0 + call void asm sideeffect "", "~{v22}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; 
use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } Index: llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -1,5 +1,8 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s + +# -run-pass is used to artificially avoid using split register allocation, which would avoid stressing StackSlotColoring. + # Make sure that stack slot coloring doesn't try to merge frame # indexes used for SGPR spilling with those that aren't.
Index: llvm/test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -197,15 +197,15 @@ ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec ; GCN: s_mov_b32 s33, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v42, s34, 0 -; GCN-DAG: v_writelane_b32 v42, s35, 1 +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 [[CSRV]], s34, 0 +; GCN-DAG: v_writelane_b32 [[CSRV]], s35, 1 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -214,20 +214,20 @@ ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; GCN-DAG: v_readlane_b32 s34, v42, 0 -; GCN-DAG: v_readlane_b32 s35, v42, 1 +; GCN-DAG: v_readlane_b32 s34, [[CSRV]], 0 +; GCN-DAG: v_readlane_b32 s35, [[CSRV]], 1 ; GCN: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 
s33, ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { Index: llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-dce-in-ra=0 -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-dce-in-ra=0 -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy,1 -o - %s | FileCheck %s # https://bugs.llvm.org/show_bug.cgi?id=33620 --- Index: llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX6 %s -; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX7 %s +; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX7 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck 
-check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX10-FLATSCR,FLATSCR %s ; Index: llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll +++ llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll @@ -22,12 +22,11 @@ } ; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object: -; CHECK-DAG: v_writelane_b32 v1, s98, 63 -; CHECK-DAG: v_writelane_b32 v2, s99, 0 +; CHECK-DAG: v_writelane_b32 v0, s98, 63 +; CHECK-DAG: v_writelane_b32 v1, s99, 0 ; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v2, 0 -; CHECK-DAG: v_readlane_b32 s98, v1, 63 - +; CHECK-DAG: v_readlane_b32 s99, v1, 0 +; CHECK-DAG: v_readlane_b32 s98, v0, 63 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca Index: llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -4,17 +4,15 @@ # CHECK-LABEL: name: no_merge_sgpr_vgpr_spill_slot{{$}} # CHECK: stack: # CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, -# CHECK-NEXT: stack-id: default, +# CHECK-NEXT: stack-id: sgpr-spill +# CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default -# CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, -# CHECK-NEXT: stack-id: sgpr-spill, - -# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) 
-# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) - -# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.1, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5) -# CHECK: $sgpr5 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5) +# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) +# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5) +# CHECK: $vgpr2 = V_WRITELANE_B32 killed $sgpr5, 0, $vgpr2 +# CHECK: $sgpr5 = V_READLANE_B32 $vgpr2, 0 name: no_merge_sgpr_vgpr_spill_slot tracksRegLiveness: true machineFunctionInfo: Index: llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -8,11 +8,11 @@ ; preserved across the call and should get 8 scratch registers. 
; GFX9-LABEL: non_preserved_vgpr_tuple8: -; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9: v_mov_b32_e32 v36, v16 ; GFX9-NEXT: v_mov_b32_e32 v35, v15 @@ -21,28 +21,28 @@ ; GFX9-NEXT: v_mov_b32_e32 v32, v12 ; GFX9: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 +; GFX9: image_gather4_c_b_cl v[41:44], v[32:39], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v44, off, s[0:3], 
s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9: s_setpc_b64 s[4:5] ; ; GFX10-LABEL: non_preserved_vgpr_tuple8: -; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10: v_mov_b32_e32 v36, v16 ; GFX10-NEXT: v_mov_b32_e32 v35, v15 @@ -53,7 +53,7 @@ ; GFX10: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10: image_gather4_c_b_cl v[41:44], v[32:39], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 @@ -62,12 +62,12 @@ ; GFX10: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10: buffer_load_dword v43, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 
offset:8 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 +; GFX10: buffer_load_dword v44, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX10: s_setpc_b64 s[4:5] main_body: call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 @@ -86,20 +86,20 @@ ; The upper 3 sub-registers are unused. ; GFX9-LABEL: call_preserved_vgpr_tuple8: -; GFX9: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill - -; GFX9: v_mov_b32_e32 v44, v16 -; GFX9-NEXT: v_mov_b32_e32 v43, v15 -; GFX9-NEXT: v_mov_b32_e32 v42, v14 -; GFX9-NEXT: v_mov_b32_e32 v41, v13 -; GFX9-NEXT: v_mov_b32_e32 v40, v12 - -; GFX9: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1 +; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 ; 4-byte Folded Spill + +; GFX9: v_mov_b32_e32 v60, 
v16 +; GFX9-NEXT: v_mov_b32_e32 v59, v15 +; GFX9-NEXT: v_mov_b32_e32 v58, v14 +; GFX9-NEXT: v_mov_b32_e32 v57, v13 +; GFX9-NEXT: v_mov_b32_e32 v56, v12 + +; GFX9: image_gather4_c_b_cl v[0:3], v[56:63], s[36:43], s[4:7] dmask:0x1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -108,24 +108,24 @@ ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[56:63], s[36:43], s[4:7] dmask:0x1 -; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v60, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9: s_setpc_b64 s[4:5] ; ; GFX10-LABEL: call_preserved_vgpr_tuple8: -; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 
4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D @@ -133,24 +133,24 @@ ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v40, v16 +; GFX10-NEXT: v_mov_b32_e32 v41, v16 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v41, v15 -; GFX10-NEXT: v_mov_b32_e32 v42, v14 -; GFX10-NEXT: v_mov_b32_e32 v43, v13 -; GFX10-NEXT: v_mov_b32_e32 v44, v12 +; GFX10-NEXT: v_mov_b32_e32 v42, v15 +; GFX10-NEXT: v_mov_b32_e32 v43, v14 +; GFX10-NEXT: v_mov_b32_e32 v44, v13 +; GFX10-NEXT: v_mov_b32_e32 v45, v12 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D - -; GFX10: buffer_load_dword v44, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 
offset:12 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 -; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D + +; GFX10: buffer_load_dword v45, off, s[0:3], s33{{$}} +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 +; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; GFX10: s_setpc_b64 s[4:5] main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) Index: llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck %s # The undef copy of %4 is allocated to $vgpr3, and the identity copy # was deleted, and $vgpr3 was considered undef. 
The code to replace @@ -31,7 +31,7 @@ ; CHECK-LABEL: name: undef_identity_copy ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95, implicit-def $scc ; CHECK: $sgpr4 = COPY $sgpr95 ; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @foo, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4 ; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95