diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -20,6 +20,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" @@ -37,6 +38,7 @@ const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; LiveIntervals *LIS = nullptr; + MachineDominatorTree *MDT = nullptr; // Save and Restore blocks of the current function. Typically there is a // single save block, unless Windows EH funclets are involved. @@ -50,13 +52,23 @@ void calculateSaveRestoreBlocks(MachineFunction &MF); bool spillCalleeSavedRegs(MachineFunction &MF); + void updateLaneVGPRDomInstr( + int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, + DenseMap &LaneVGPRDomInstr); bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getClearedProperties() const override { + return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA) + .set(MachineFunctionProperties::Property::NoVRegs); + } }; } // end anonymous namespace @@ -67,6 +79,7 @@ "SI lower SGPR spill instructions", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) @@ -243,12 +256,66 @@ return false; } +void SILowerSGPRSpills::updateLaneVGPRDomInstr( + int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, + DenseMap 
&LaneVGPRDomInstr) { + // For the Def of a virtual LaneVGPR to dominate all its uses, we should + // insert an IMPLICIT_DEF before the dominating spill. Switching to a + // depth-first order doesn't really help since the machine function can have + // unstructured control flow post-SSA. Hence, for each virtual register, we + // find the common dominator to get either the dominating spill or a block + // dominating all spills. Is there a better way to handle it? + SIMachineFunctionInfo *FuncInfo = + MBB->getParent()->getInfo(); + ArrayRef VGPRSpills = + FuncInfo->getSGPRToVGPRSpills(FI); + Register PrevLaneVGPR; + bool SeenSpillInBlock = false; + for (auto &Spill : VGPRSpills) { + if (PrevLaneVGPR == Spill.VGPR) + continue; + + PrevLaneVGPR = Spill.VGPR; + auto I = LaneVGPRDomInstr.find(Spill.VGPR); + if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) { + // Initially add the spill instruction itself as the insertion point. + LaneVGPRDomInstr[Spill.VGPR] = InsertPt; + } else { + assert(I != LaneVGPRDomInstr.end()); + auto PrevInsertPt = I->second; + MachineBasicBlock *DomMBB = PrevInsertPt->getParent(); + if (DomMBB == MBB) { + // The insertion point earlier selected in a predecessor block whose + // spills are currently being lowered. The earlier InsertPt would be + // the one just before the block terminator and it should be changed + // if we insert any new spill in it. Check if they dominate only for + // the first spill in case of tuple spills. + if (!SeenSpillInBlock) { + SeenSpillInBlock = true; + if (MDT->dominates(&*InsertPt, &*PrevInsertPt)) + I->second = InsertPt; + } + continue; + } + + // Find the common dominator block between PrevInsertPt and the + // current spill. 
+ DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB); + if (DomMBB == MBB) + I->second = InsertPt; + else if (DomMBB != PrevInsertPt->getParent()) + I->second = &(*DomMBB->getFirstTerminator()); + } + } +} + bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); LIS = getAnalysisIfAvailable(); + MDT = &getAnalysis(); assert(SaveBlocks.empty() && RestoreBlocks.empty()); @@ -258,7 +325,6 @@ bool HasCSRs = spillCalleeSavedRegs(MF); MachineFrameInfo &MFI = MF.getFrameInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); if (!MFI.hasStackObjects() && !HasCSRs) { @@ -268,7 +334,6 @@ } bool MadeChange = false; - bool NewReservedRegs = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. @@ -284,6 +349,9 @@ // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + // To track the IMPLICIT_DEF insertion point for the lane vgprs. + DenseMap LaneVGPRDomInstr; + for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { if (!TII->isSGPRSpill(MI)) @@ -291,23 +359,31 @@ int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + MachineInstrSpan MIS(&MI, &MBB); if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { - NewReservedRegs = true; bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr, LIS); (void)Spilled; assert(Spilled && "failed to spill SGPR to VGPR when allocated"); SpillFIs.set(FI); + updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr); } } } - // FIXME: Adding to live-ins redundant with reserving registers. 
- for (MachineBasicBlock &MBB : MF) { - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) - MBB.addLiveIn(Reg); - MBB.sortUniqueLiveIns(); + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) { + auto InsertPt = LaneVGPRDomInstr[Reg]; + // Insert the IMPLICIT_DEF at the identified points. + auto MIB = + BuildMI(*InsertPt->getParent(), *InsertPt, InsertPt->getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), Reg); + if (LIS) { + LIS->InsertMachineInstrInMaps(*MIB); + LIS->createAndComputeVirtRegInterval(Reg); + } + } + for (MachineBasicBlock &MBB : MF) { // FIXME: The dead frame indices are replaced with a null register from // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is @@ -333,9 +409,5 @@ SaveBlocks.clear(); RestoreBlocks.clear(); - // Updated the reserved registers with any VGPRs added for SGPR spills. - if (NewReservedRegs) - MRI.freezeReservedRegs(MF); - return MadeChange; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -287,33 +287,11 @@ bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, unsigned LaneIndex) { - const GCNSubtarget &ST = MF.getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); - if (LaneVGPR == AMDGPU::NoRegister) { - // We have no VGPRs left for spilling SGPRs. Reset because we will not - // partially spill the SGPR to VGPRs. - SGPRToVGPRSpills.erase(FI); - - // FIXME: We can run out of free registers with split allocation if - // IPRA is enabled and a called function already uses every VGPR. 
-#if 0 - DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), - "VGPRs for SGPR spilling", - 0, DS_Error); - MF.getFunction().getContext().diagnose(DiagOutOfRegs); -#endif - return false; - } - + LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); SpillVGPRs.push_back(LaneVGPR); - // Add this register as live-in to all blocks to avoid machine verifier - // complaining about use of an undefined physical register. - for (MachineBasicBlock &BB : MF) - BB.addLiveIn(LaneVGPR); } else { LaneVGPR = SpillVGPRs.back(); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -711,9 +711,6 @@ for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) reserveRegisterTuples(Reserved, Reg); - for (auto Reg : MFI->getSGPRSpillVGPRs()) - reserveRegisterTuples(Reserved, Reg); - return Reserved; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -11,6 +11,7 @@ ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: ; implicit-def: $vgpr40 ; CHECK-NEXT: v_writelane_b32 v41, s33, 0 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_addk_i32 s32, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -246,6 +246,7 @@ ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 +; MUBUF-NEXT: ; 
implicit-def: $vgpr40 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; MUBUF-NEXT: v_mov_b32_e32 v0, 11 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 @@ -281,6 +282,7 @@ ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 10 +; FLATSCR-NEXT: ; implicit-def: $vgpr40 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 11 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 @@ -319,11 +321,12 @@ ; MUBUF-NEXT: v_writelane_b32 v41, s33, 0 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 -; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: ; implicit-def: $vgpr40 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_waitcnt vmcnt(1) ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; MUBUF-NEXT: s_waitcnt vmcnt(1) @@ -400,11 +403,12 @@ ; FLATSCR-NEXT: v_writelane_b32 v41, s33, 0 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 -; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: ; implicit-def: $vgpr40 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: s_or_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: v_mov_b32_e32 v14, v1 ; CHECK-NEXT: v_mov_b32_e32 v13, v2 @@ -38,10 +38,12 @@ ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_writelane_b32 v16, s4, 0 -; CHECK-NEXT: v_writelane_b32 v16, s5, 1 -; CHECK-NEXT: v_writelane_b32 v16, s6, 2 -; CHECK-NEXT: v_writelane_b32 v16, s7, 3 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: v_writelane_b32 v0, s6, 2 +; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 @@ -49,8 +51,11 @@ ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v16, s4, 4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_writelane_b32 v0, s4, 4 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload 
@@ -77,6 +82,7 @@ ; CHECK-NEXT: v_readfirstlane_b32 s6, v2 ; CHECK-NEXT: v_readfirstlane_b32 s5, v1 ; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: s_mov_b32 s13, s10 ; CHECK-NEXT: s_mov_b32 s14, s9 @@ -85,14 +91,16 @@ ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v16, s12, 5 -; CHECK-NEXT: v_writelane_b32 v16, s13, 6 -; CHECK-NEXT: v_writelane_b32 v16, s14, 7 -; CHECK-NEXT: v_writelane_b32 v16, s15, 8 -; CHECK-NEXT: v_writelane_b32 v16, s16, 9 -; CHECK-NEXT: v_writelane_b32 v16, s17, 10 -; CHECK-NEXT: v_writelane_b32 v16, s18, 11 -; CHECK-NEXT: v_writelane_b32 v16, s19, 12 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_writelane_b32 v0, s12, 5 +; CHECK-NEXT: v_writelane_b32 v0, s13, 6 +; CHECK-NEXT: v_writelane_b32 v0, s14, 7 +; CHECK-NEXT: v_writelane_b32 v0, s15, 8 +; CHECK-NEXT: v_writelane_b32 v0, s16, 9 +; CHECK-NEXT: v_writelane_b32 v0, s17, 10 +; CHECK-NEXT: v_writelane_b32 v0, s18, 11 +; CHECK-NEXT: v_writelane_b32 v0, s19, 12 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v6, v8 ; CHECK-NEXT: v_mov_b32_e32 v7, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v10 @@ -111,25 +119,30 @@ ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v16, s4, 13 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_writelane_b32 v0, s4, 13 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: ; %bb.2: ; in Loop: 
Header=BB0_1 Depth=1 -; CHECK-NEXT: v_readlane_b32 s4, v16, 13 +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s4, v2, 13 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s8, v16, 5 -; CHECK-NEXT: v_readlane_b32 s9, v16, 6 -; CHECK-NEXT: v_readlane_b32 s10, v16, 7 -; CHECK-NEXT: v_readlane_b32 s11, v16, 8 -; CHECK-NEXT: v_readlane_b32 s12, v16, 9 -; CHECK-NEXT: v_readlane_b32 s13, v16, 10 -; CHECK-NEXT: v_readlane_b32 s14, v16, 11 -; CHECK-NEXT: v_readlane_b32 s15, v16, 12 -; CHECK-NEXT: v_readlane_b32 s16, v16, 0 -; CHECK-NEXT: v_readlane_b32 s17, v16, 1 -; CHECK-NEXT: v_readlane_b32 s18, v16, 2 -; CHECK-NEXT: v_readlane_b32 s19, v16, 3 +; CHECK-NEXT: v_readlane_b32 s8, v2, 5 +; CHECK-NEXT: v_readlane_b32 s9, v2, 6 +; CHECK-NEXT: v_readlane_b32 s10, v2, 7 +; CHECK-NEXT: v_readlane_b32 s11, v2, 8 +; CHECK-NEXT: v_readlane_b32 s12, v2, 9 +; CHECK-NEXT: v_readlane_b32 s13, v2, 10 +; CHECK-NEXT: v_readlane_b32 s14, v2, 11 +; CHECK-NEXT: v_readlane_b32 s15, v2, 12 +; CHECK-NEXT: v_readlane_b32 s16, v2, 0 +; CHECK-NEXT: v_readlane_b32 s17, v2, 1 +; CHECK-NEXT: v_readlane_b32 s18, v2, 2 +; CHECK-NEXT: v_readlane_b32 s19, v2, 3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -137,7 +150,9 @@ ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_readlane_b32 s4, v16, 4 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s4, v0, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte 
Folded Reload @@ -146,7 +161,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: s_or_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -238,8 +238,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -21,6 +21,7 @@ ; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FIXEDABI-NEXT: s_mov_b64 exec, s[16:17] +; FIXEDABI-NEXT: ; implicit-def: $vgpr40 ; FIXEDABI-NEXT: v_writelane_b32 v41, s33, 0 ; FIXEDABI-NEXT: s_mov_b32 s33, s32 ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -902,6 +902,11 @@ ; CHECK-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt expcnt(1) ; CHECK-NEXT: v_writelane_b32 v0, s30, 0 ; CHECK-NEXT: v_writelane_b32 v0, s31, 1 @@ -978,9 +983,6 @@ ; CHECK-NEXT: v_writelane_b32 v1, s101, 6 ; CHECK-NEXT: s_cmp_eq_u32 s31, 0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -8,12 +8,12 @@ @alias = hidden alias void (), void ()* @aliasee_default ; ALL-LABEL: {{^}}kernel: -; GFX908: .amdhsa_next_free_vgpr 41 +; GFX908: .amdhsa_next_free_vgpr 32 ; GFX908-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A: .amdhsa_next_free_vgpr 71 +; GFX90A: .amdhsa_next_free_vgpr 59 ; GFX90A-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A-NEXT: .amdhsa_accum_offset 44 +; GFX90A-NEXT: .amdhsa_accum_offset 32 define amdgpu_kernel void @kernel() #0 { bb: call void @alias() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -9,7 +9,7 @@ ; The parent kernel has a higher VGPR usage than the possible callees. 
; CHECK-LABEL: {{^}}kernel1: -; CHECK: .amdhsa_next_free_vgpr 42 +; CHECK: .amdhsa_next_free_vgpr 41 ; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel1() #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -26,9 +26,9 @@ ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword ; FLATSCR: scratch_store_dword +; GCN: v_writelane_b32 v41, s33, 0 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 -; GCN: v_writelane_b32 v41, s33, 0 ; GCN: v_writelane_b32 v40, s34, 2 ; GCN: v_writelane_b32 v40, s35, 3 diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -222,6 +222,7 @@ ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s42, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; clobber s42 @@ -277,8 +278,8 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-60: v_writelane_b32 v0 ; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 +; GCN-COUNT-60: v_writelane_b32 v0 ; GCN: s_mov_b32 s33, s32 ; GCN: v_writelane_b32 v0 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill @@ -323,10 +324,10 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte 
Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-61: v_writelane_b32 v0, +; MUBUF: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; FLATSCR: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; GCN-COUNT-61: v_writelane_b32 v0, ; FLATSCR-NEXT: s_mov_b32 s33, s32 -; MUBUF: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill @@ -397,6 +398,7 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 @@ -440,6 +442,7 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; GCN-NEXT: ; implicit-def: $vgpr48 ; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; MUBUF: s_addk_i32 s32, 0x300{{$}} @@ -488,10 +491,12 @@ ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; MUBUF-NEXT: ; implicit-def: $vgpr48 ; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-DAG: s_mov_b32 s33, s32 ; MUBUF-DAG: s_add_i32 s32, s32, 0x40300{{$}} ; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}} +; FLATSCR-DAG: ; implicit-def: $vgpr48 ; MUBUF-DAG: buffer_store_dword ; FLATSCR-DAG: scratch_store_dword @@ -626,14 +631,14 @@ ; Make sure that the FP save happens after restoring exec from the same ; register. 
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: -; GCN-NOT: v_writelane_b32 v40, s33 +; GCN-NOT: v_writelane_b32 v48, s33 ; FLATSCR: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR: s_mov_b64 exec, s[0:1] ; FLATSCR: s_mov_b32 s0, s33 ; FLATSCR: s_mov_b32 s33, s32 ; FLATSCR: s_mov_b32 s33, s0 ; FLATSCR: s_or_saveexec_b64 s[0:1], -1 -; GCN-NOT: v_readlane_b32 s33, v40 +; GCN-NOT: v_readlane_b32 s33, v48 ; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_reg() #1 { call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", @@ -664,7 +669,7 @@ ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; MUBUF: v_mov_b32_e32 v0, s33 ; GCN-NOT: v_mov_b32_e32 v0, 0x100c -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200 +; MUBUF-DAG: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200 ; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; FLATSCR: v_mov_b32_e32 v0, 0 ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -19,14 +19,14 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], 
s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -84,14 +84,14 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -162,7 +162,7 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -171,7 +171,7 @@ ; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] +; GCN-O0-DAG: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[THEN_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: s_branch 
[[TEMP_BB:.LBB[0-9_]+]] ; GCN-O0-NEXT: {{^}}[[THEN_INNER]]: @@ -181,7 +181,7 @@ ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] +; GCN-O0-DAG: s_xor_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: store_dword @@ -262,7 +262,7 @@ ; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] +; GCN-O0-DAG: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: s_branch [[INNER_IF_OUTER_ELSE:.LBB[0-9_]+]] ; GCN-O0-NEXT: {{^}}[[THEN_OUTER]]: @@ -272,14 +272,14 @@ ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] +; GCN-O0-DAG: s_xor_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: store_dword ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[FLOW1:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ 
-289,7 +289,7 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -359,7 +359,7 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -421,61 +421,73 @@ ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: buffer_store_dword [[VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 ; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0: buffer_load_dword -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, 
[[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0: buffer_load_dword [[RESTORED_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0: buffer_load_dword +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: buffer_store_dword [[RESTORED_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 +; GCN-O0: buffer_load_dword [[RESTORED_1_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 ; GCN-O0: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] ; GCN-O0-NEXT: s_mov_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; 
GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-NEXT: buffer_store_dword [[RESTORED_1_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]] +; GCN-O0: buffer_load_dword [[RESTORED_2_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_2_VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_2_VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]] ; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_2_VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_2_VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: buffer_store_dword [[RESTORED_2_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[FLOW2:.LBB[0-9_]+]] ; GCN-O0: {{^}}[[FLOW2]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]] +; GCN-O0: 
buffer_load_dword [[RESTORED_3_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_3_VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_3_VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]] ; GCN-O0: s_branch [[FLOW:.LBB[0-9_]+]] ; GCN-O0: {{^}}[[FLOW]]: ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_3_VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_3_VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: buffer_store_dword [[RESTORED_3_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[FLOW3:.LBB[0-9_]+]] ; GCN-O0: ; %bb.{{[0-9]+}}: -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0: buffer_load_dword [[RESTORED_4_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_4_VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_4_VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: buffer_store_dword [[RESTORED_4_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 ; GCN-O0: {{^}}[[FLOW3]]: +; GCN-O0: buffer_load_dword [[RESTORED_5_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:68 ; GCN-O0-COUNT-4: buffer_load_dword -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], 
[[OUTER_LOOP_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]] ; GCN-O0: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-COUNT-2: s_mov_b64 -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] ; GCN-O0-COUNT-4: buffer_store_dword ; GCN-O0: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -10,7 +10,7 @@ ; GCN-LABEL: {{^}}divergent_if_endif: -; VGPR: 
workitem_private_segment_byte_size = 12{{$}} +; VGPR: workitem_private_segment_byte_size = 16{{$}} ; GCN: {{^}}; %bb.0: @@ -82,7 +82,7 @@ } ; GCN-LABEL: {{^}}divergent_loop: -; VGPR: workitem_private_segment_byte_size = 16{{$}} +; VGPR: workitem_private_segment_byte_size = 20{{$}} ; GCN: {{^}}; %bb.0: ; GCN-DAG: s_mov_b32 m0, -1 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -31,6 +31,7 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 @@ -67,6 +68,7 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 @@ -103,6 +105,7 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 @@ -139,6 +142,7 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; 
GCN-NEXT: s_addk_i32 s32, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir --- a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir @@ -19,10 +19,11 @@ ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr42, 0, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr43, 1, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr46, 2, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr47, 3, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr42, 0, killed $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr43, 1, killed $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr46, 2, killed $vgpr0 + ; CHECK-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr47, 3, killed $vgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -17,26 +17,27 @@ ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: 
v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 +; CHECK-NEXT: ; implicit-def: $vgpr41 ; CHECK-NEXT: v_writelane_b32 v42, s33, 0 +; CHECK-NEXT: v_writelane_b32 v41, s30, 0 +; CHECK-NEXT: v_writelane_b32 v41, s31, 1 +; CHECK-NEXT: v_writelane_b32 v41, s34, 2 +; CHECK-NEXT: v_writelane_b32 v41, s35, 3 +; CHECK-NEXT: v_writelane_b32 v41, s36, 4 +; CHECK-NEXT: v_writelane_b32 v41, s37, 5 +; CHECK-NEXT: v_writelane_b32 v41, s38, 6 +; CHECK-NEXT: v_writelane_b32 v41, s39, 7 +; CHECK-NEXT: v_writelane_b32 v41, s40, 8 +; CHECK-NEXT: v_writelane_b32 v41, s41, 9 +; CHECK-NEXT: v_writelane_b32 v41, s42, 10 +; CHECK-NEXT: v_writelane_b32 v41, s43, 11 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_writelane_b32 v40, s46, 13 +; CHECK-NEXT: v_writelane_b32 v41, s44, 12 +; CHECK-NEXT: v_writelane_b32 v41, s46, 13 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef ; CHECK-NEXT: .Ltmp0: @@ -44,11 +45,11 @@ ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12 -; CHECK-NEXT: v_writelane_b32 v40, s47, 14 +; CHECK-NEXT: v_writelane_b32 v41, s47, 14 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], 
s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b32 s42, s14 ; CHECK-NEXT: s_mov_b32 s43, s13 ; CHECK-NEXT: s_mov_b32 s44, s12 @@ -64,33 +65,33 @@ ; CHECK-NEXT: s_mov_b32 s12, s44 ; CHECK-NEXT: s_mov_b32 s13, s43 ; CHECK-NEXT: s_mov_b32 s14, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0] ; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9 -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: flat_store_dword v[0:1], v2 -; CHECK-NEXT: v_readlane_b32 s47, v40, 14 -; CHECK-NEXT: v_readlane_b32 s46, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s47, v41, 14 +; CHECK-NEXT: v_readlane_b32 s46, v41, 13 +; CHECK-NEXT: v_readlane_b32 s44, v41, 12 +; CHECK-NEXT: v_readlane_b32 s43, v41, 11 +; CHECK-NEXT: v_readlane_b32 s42, v41, 10 +; CHECK-NEXT: v_readlane_b32 s41, v41, 9 +; CHECK-NEXT: v_readlane_b32 s40, v41, 8 +; CHECK-NEXT: v_readlane_b32 s39, v41, 7 +; CHECK-NEXT: v_readlane_b32 s38, v41, 6 +; CHECK-NEXT: v_readlane_b32 s37, v41, 5 +; CHECK-NEXT: v_readlane_b32 s36, v41, 4 +; CHECK-NEXT: v_readlane_b32 s35, v41, 3 +; CHECK-NEXT: v_readlane_b32 s34, v41, 2 +; CHECK-NEXT: v_readlane_b32 s31, v41, 1 +; 
CHECK-NEXT: v_readlane_b32 s30, v41, 0 ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: v_readlane_b32 s33, v42, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -118,26 +118,19 @@ ; FLAT_SCR_OPT-NEXT: s_addc_u32 s3, s3, 0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; FLAT_SCR_OPT-NEXT: s_clause 0x1 ; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s104, exec_lo -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s105, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v72, s105 -; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v72, s2, 0 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s105, 4 -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v72, s3, 1 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v72, s105 ; 4-byte Folded Spill -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s105, 0 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v72, off, s105 -; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s104 ; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 +; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s104, 4 ; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1 +; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 +; 
FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s104 ; 4-byte Folded Spill ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND +; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4 ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART @@ -220,8 +213,6 @@ ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, vcc_lo ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART @@ -230,48 +221,30 @@ ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, exec_lo -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v2, s3 -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 4 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v2, off, s3 ; 4-byte Folded Reload -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 -; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v2, 0 -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v2, 1 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v2, off, s3 +; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload +; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, vcc_lo ; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s2 +; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 +; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, 0 ; FLAT_SCR_OPT-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_OPT-NEXT: s_endpgm ; ; FLAT_SCR_ARCH-LABEL: test: ; FLAT_SCR_ARCH: ; %bb.0: +; FLAT_SCR_ARCH-NEXT: s_clause 0x1 ; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; FLAT_SCR_ARCH-NEXT: 
s_mov_b32 s104, exec_lo -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s105, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v72, s105 -; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v72, s2, 0 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s105, 4 -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v72, s3, 1 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v72, s105 ; 4-byte Folded Spill -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s105, 0 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v72, off, s105 -; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s104 ; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 +; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s104, 4 ; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1 +; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s104 ; 4-byte Folded Spill ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4 ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART @@ -354,8 +327,6 @@ ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, vcc_lo ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART @@ -364,22 +335,11 @@ ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, exec_lo -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v2, s3 -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; 
FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 4 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v2, off, s3 ; 4-byte Folded Reload -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 -; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v2, 0 -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v2, 1 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v2, off, s3 +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload +; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, vcc_lo ; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s2 +; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 +; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, 0 ; FLAT_SCR_ARCH-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_ARCH-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -12,14 +12,13 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo + ; CHECK: S_NOP 0, implicit-def $exec_lo ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_lo @@ 
-38,14 +37,13 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi + ; CHECK: S_NOP 0, implicit-def $exec_hi ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_hi @@ -64,17 +62,16 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $exec + ; CHECK: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; 
CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec @@ -96,13 +93,12 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo @@ -120,13 +116,12 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead 
renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi @@ -144,16 +139,15 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir +++ 
b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir @@ -13,14 +13,13 @@ bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0 - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $m0 + ; CHECK: S_NOP 0, implicit-def $m0 ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec @@ -44,13 +43,12 @@ bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0 - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ 
b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -14,6 +14,7 @@ ; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; SPILL-TO-VGPR-NEXT: ; implicit-def: $vgpr40 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s33, 0 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -11,6 +11,8 @@ ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SDAG-NEXT: s_mov_b64 exec, s[34:35] +; SDAG-NEXT: ; implicit-def: $vgpr40 +; SDAG-NEXT: s_mov_b32 s36, s33 ; SDAG-NEXT: v_writelane_b32 v40, s4, 0 ; SDAG-NEXT: v_writelane_b32 v40, s5, 1 ; SDAG-NEXT: v_writelane_b32 v40, s6, 2 @@ -31,7 +33,6 @@ ; SDAG-NEXT: v_writelane_b32 v40, s21, 17 ; SDAG-NEXT: v_writelane_b32 v40, s22, 18 ; SDAG-NEXT: v_writelane_b32 v40, s23, 19 -; SDAG-NEXT: s_mov_b32 s36, s33 ; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x400 ; SDAG-NEXT: v_writelane_b32 v40, s24, 20 @@ -91,6 +92,8 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[34:35] +; GISEL-NEXT: ; implicit-def: $vgpr40 +; GISEL-NEXT: s_mov_b32 s36, s33 ; GISEL-NEXT: v_writelane_b32 v40, s4, 0 ; GISEL-NEXT: v_writelane_b32 v40, s5, 1 ; GISEL-NEXT: v_writelane_b32 v40, s6, 2 @@ -111,7 +114,6 @@ ; GISEL-NEXT: v_writelane_b32 v40, s21, 17 ; GISEL-NEXT: v_writelane_b32 v40, s22, 18 ; GISEL-NEXT: v_writelane_b32 v40, s23, 19 -; GISEL-NEXT: s_mov_b32 s36, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 
s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s24, 20 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -100,6 +100,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -131,15 +132,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -164,15 +166,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, 
s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 @@ -201,6 +204,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -209,6 +213,7 @@ ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -234,14 +239,16 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -269,14 +276,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, 
s0 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -307,6 +316,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -315,6 +325,7 @@ ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -340,14 +351,16 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; 
GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -375,14 +388,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -411,6 +426,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -441,15 +457,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: 
s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -473,15 +490,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -509,6 +527,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -540,8 +559,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: 
s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -573,8 +593,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -609,6 +630,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -640,8 +662,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -673,8 +696,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -707,6 +731,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, 
s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -737,15 +762,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -769,15 +795,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -805,6 +832,7 @@ ; 
GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -836,8 +864,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -869,8 +898,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -905,6 +935,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -936,8 +967,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -969,8 +1001,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, 
v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -1003,6 +1036,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1033,15 +1067,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -1065,15 +1100,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -1099,6 +1135,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1130,8 +1167,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 @@ -1163,8 +1201,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 @@ -1201,6 +1240,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1232,15 +1272,16 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -1266,15 +1307,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -1301,6 +1343,7 @@ ; 
GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1334,8 +1377,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -1369,8 +1413,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -1409,6 +1454,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1442,8 +1488,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: global_load_dwordx4 v[0:3], 
v[0:1], off @@ -1478,8 +1525,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -1520,6 +1568,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1555,8 +1604,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -1593,8 +1643,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -1633,6 +1684,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; 
GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1663,15 +1715,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -1695,15 +1748,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -1729,6 +1783,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1759,15 +1814,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -1791,15 +1847,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -1825,6 +1882,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1856,8 +1914,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 @@ -1889,8 +1948,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 @@ -1924,6 +1984,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1956,8 +2017,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 @@ -1990,8 +2052,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 @@ -2026,6 +2089,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2060,8 +2124,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 @@ -2096,8 +2161,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 
0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 @@ -2134,6 +2200,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2165,8 +2232,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 @@ -2198,8 +2266,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 @@ -2233,6 +2302,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2266,8 +2336,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 
0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -2301,8 +2372,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -2338,6 +2410,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2373,8 +2446,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -2410,8 +2484,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 
v0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -2450,6 +2525,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2480,8 +2556,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -2512,8 +2589,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -2547,6 +2625,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2577,8 +2656,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; 
implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -2609,8 +2689,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -2644,6 +2725,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2674,8 +2756,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -2706,8 +2789,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -2740,6 +2824,7 @@ 
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2771,8 +2856,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 @@ -2804,8 +2890,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 @@ -2839,6 +2926,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2870,8 +2958,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 
0x40003c00 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 @@ -2903,8 +2992,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 @@ -2939,6 +3029,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -2969,8 +3060,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -3001,8 +3093,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -3035,6 +3128,7 @@ ; 
GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3066,8 +3160,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 @@ -3099,8 +3194,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 @@ -3135,6 +3231,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3165,8 +3262,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; 
GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -3197,8 +3295,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -3232,6 +3331,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3262,8 +3362,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -3294,8 +3395,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -3328,6 +3430,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3359,8 +3462,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 @@ -3392,8 +3496,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 @@ -3427,6 +3532,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3459,8 +3565,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: 
v_writelane_b32 v41, s33, 0 @@ -3493,8 +3600,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 @@ -3529,6 +3637,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3562,8 +3671,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 @@ -3597,8 +3707,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 @@ -3635,6 +3746,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: 
s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3665,8 +3777,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -3697,8 +3810,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -3731,6 +3845,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3764,8 +3879,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -3799,8 +3915,9 @@ ; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -3836,6 +3953,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3870,8 +3988,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -3906,8 +4025,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -3946,6 +4066,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: ; implicit-def: 
$vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -3980,8 +4101,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4017,8 +4139,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -4056,6 +4179,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -4093,8 +4217,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -4132,8 +4257,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -4175,6 +4301,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -4211,8 +4338,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4250,8 +4378,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -4293,6 +4422,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -4334,8 +4464,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 
0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4377,8 +4508,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -4425,6 +4557,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: global_load_dword v32, v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4468,8 +4601,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dword v33, v[0:1], off @@ -4514,8 +4648,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dword v33, 
v[0:1], off @@ -4560,33 +4695,34 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v43, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v42, v1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: global_store_dword v[41:42], v0, off +; GFX9-NEXT: global_store_dword v[40:41], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 
4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: v_readlane_b32 s33, v43, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4597,36 +4733,37 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v43, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-NEXT: ; implicit-def: $vgpr42 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v40, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v42, v1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: 
s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: global_store_dword v[41:42], v0, off +; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: v_readlane_b32 s33, v43, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 @@ -4638,36 +4775,37 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s32 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s32 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 +; 
GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr42 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, v0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v43, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s32 offset:8 ; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s32 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 @@ -4688,6 +4826,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; 
GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -4722,8 +4861,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4759,8 +4899,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -4801,6 +4942,7 @@ ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: s_addk_i32 s32, 0x800 @@ -4836,11 +4978,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; 
GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -4872,11 +5015,12 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -4916,6 +5060,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: s_addk_i32 s32, 0x800 @@ -4958,17 +5103,18 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 -; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 +; 
GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 @@ -5009,9 +5155,10 @@ ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_add_i32 vcc_lo, s33, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, vcc_lo ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s33 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 @@ -5063,6 +5210,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -5115,8 +5263,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -5170,8 +5319,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 @@ -5231,6 +5381,8 @@ ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 @@ -5261,7 +5413,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s59, 27 ; GFX9-NEXT: v_writelane_b32 v40, s60, 28 ; GFX9-NEXT: v_writelane_b32 v40, s61, 29 -; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s62, 30 ; GFX9-NEXT: v_writelane_b32 v40, s63, 31 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -5326,8 +5477,9 @@ ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 @@ -5421,8 +5573,9 @@ ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16 ; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 @@ -5515,6 +5668,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -5546,15 +5700,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -5579,15 +5734,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, 
v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 @@ -5614,8 +5770,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -5646,15 +5803,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 @@ -5680,15 +5838,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 @@ -5716,8 +5875,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -5748,15 +5908,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 @@ -5782,15 +5943,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; 
GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 @@ -5818,8 +5980,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -5850,15 +6013,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 42 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; 
GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 @@ -5884,15 +6048,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 @@ -5920,9 +6085,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -5955,16 +6121,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: 
v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -5992,16 +6159,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6031,10 +6199,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; 
GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 @@ -6070,8 +6239,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -6111,8 +6281,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -6155,11 +6326,12 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -6196,16 +6368,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; 
implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -6239,16 +6412,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -6284,10 +6458,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 
+; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 @@ -6329,8 +6504,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -6376,8 +6552,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -6428,11 +6605,12 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 @@ -6479,8 +6657,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 
offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -6532,8 +6711,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -6589,8 +6769,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -6621,15 +6802,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 ; 
GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 @@ -6655,15 +6837,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 @@ -6691,8 +6874,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -6723,15 +6907,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; 
GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 4.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 @@ -6757,15 +6942,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 @@ -6793,9 +6979,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; 
GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -6828,16 +7015,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -6865,16 +7053,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6904,10 +7093,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 @@ -6942,16 +7132,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 @@ -6982,16 +7173,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 @@ -7024,12 +7216,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 @@ -7068,16 +7261,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, 
external_void_func_v5f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -7114,16 +7308,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -7162,9 +7357,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7197,16 +7393,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, 
s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -7234,16 +7431,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7273,11 +7471,12 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte 
Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -7314,16 +7513,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -7357,16 +7557,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; 
GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -7402,13 +7603,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 @@ -7449,16 +7651,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -7498,16 
+7701,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -7549,9 +7753,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -7581,15 +7786,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, 
s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 @@ -7615,15 +7821,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 @@ -7652,10 +7859,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: 
v_writelane_b32 v40, s30, 2 @@ -7686,8 +7894,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -7722,8 +7931,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -7761,10 +7971,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7795,8 +8006,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: 
s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -7831,8 +8043,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -7870,9 +8083,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7905,16 +8119,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 3 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 3 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 
3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -7942,16 +8157,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7981,9 +8197,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -8016,16 +8233,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 -; 
GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -8053,16 +8271,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8092,10 +8311,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 
v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -8126,8 +8346,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -8162,8 +8383,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -8201,9 +8423,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -8236,16 +8459,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; 
GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -8273,16 +8497,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8312,9 +8537,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, 
s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -8344,15 +8570,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 @@ -8378,15 +8605,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 @@ -8415,10 +8643,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -8449,8 +8678,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -8485,8 +8715,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -8524,9 +8755,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; 
GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -8559,16 +8791,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -8596,16 +8829,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, 
external_void_func_v2i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8635,10 +8869,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 @@ -8673,16 +8908,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 5 ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 @@ -8713,16 +8949,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: 
$vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 @@ -8755,11 +8992,12 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -8796,16 +9034,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 
s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 5 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8839,16 +9078,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8884,12 +9124,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; 
GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -8922,8 +9163,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -8962,8 +9204,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -9005,11 +9248,12 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -9046,16 +9290,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: 
v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9089,16 +9334,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9134,12 +9380,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 
v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 @@ -9178,16 +9425,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9224,16 +9472,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 
v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9272,10 +9521,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 @@ -9320,8 +9570,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -9370,8 +9621,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -9424,6 +9676,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: 
s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9432,7 +9686,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 @@ -9477,16 +9730,17 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9532,16 +9786,17 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; 
GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9589,6 +9844,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9600,7 +9857,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 @@ -9653,8 +9909,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -9719,8 +9976,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; 
GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -9789,6 +10047,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9805,7 +10065,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 ; GFX9-NEXT: v_writelane_b32 v40, s21, 17 ; GFX9-NEXT: v_writelane_b32 v40, s22, 18 @@ -9897,8 +10156,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -10008,8 +10268,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: 
s_add_i32 s32, s32, 16 @@ -10119,6 +10380,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -10134,7 +10397,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 ; GFX9-NEXT: v_writelane_b32 v40, s21, 17 @@ -10232,8 +10494,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -10348,8 +10611,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -10469,6 +10733,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 +; GFX9-NEXT: ; implicit-def: 
$vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 @@ -10505,8 +10770,9 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12 @@ -10540,9 +10806,10 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 @@ -10583,6 +10850,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -10651,11 +10919,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:8 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -10722,13 +10991,14 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 2 @@ -10814,6 +11084,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -10888,6 +11159,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 13 ; GFX10-NEXT: v_mov_b32_e32 v3, 14 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -10965,9 +11237,10 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, 
s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -11054,6 +11327,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -11128,6 +11402,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -11205,9 +11480,10 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -12,9 +12,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, 
s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -48,8 +49,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -89,6 +91,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: v_writelane_b32 v0, s28, 0 ; GFX9-NEXT: v_writelane_b32 v0, s29, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -110,6 +113,7 @@ ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: v_writelane_b32 v0, s28, 0 ; GFX10-NEXT: v_writelane_b32 v0, s29, 1 ; GFX10-NEXT: ;;#ASMSTART @@ -141,8 +145,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -180,8 +185,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; 
GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -222,34 +228,35 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: v_writelane_b32 v42, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v41, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v31 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mov_b32_e32 v31, v41 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v41, 1 +; GFX9-NEXT: v_readlane_b32 s30, v41, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: 
v_readlane_b32 s33, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -260,36 +267,37 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr41 ; GFX10-NEXT: v_writelane_b32 v42, s33, 0 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v41, v31 +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v40, v31 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_mov_b32_e32 v31, v41 +; GFX10-NEXT: v_mov_b32_e32 v31, v40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 
s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v41, 1 +; GFX10-NEXT: v_readlane_b32 s30, v41, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 @@ -310,8 +318,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -349,17 +358,18 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -394,8 
+404,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -433,17 +444,18 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[36:37] ; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s34 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] @@ -478,6 +490,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: v_writelane_b32 v42, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -515,15 +528,16 @@ ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr41 ; 
GFX10-NEXT: v_writelane_b32 v42, s33, 0 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 @@ -557,6 +571,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: v_writelane_b32 v0, s33, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber @@ -576,6 +591,7 @@ ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: v_writelane_b32 v0, s33, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber @@ -599,6 +615,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: v_writelane_b32 v0, s34, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber @@ -618,6 +635,7 @@ ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: v_writelane_b32 v0, s34, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber @@ -642,6 +660,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; 
implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -671,8 +690,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -704,6 +724,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -733,8 +754,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -766,8 +788,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -804,17 +827,18 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -845,16 +869,17 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: v_writelane_b32 v42, s33, 0 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v41, s30, 1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND @@ -862,7 +887,7 @@ ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v32 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v41, v32 +; GFX9-NEXT: v_mov_b32_e32 v40, v32 ; GFX9-NEXT: 
s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 @@ -871,16 +896,16 @@ ; GFX9-NEXT: ; use s4 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v41 +; GFX9-NEXT: ; use v40 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v41, 2 +; GFX9-NEXT: v_readlane_b32 s30, v41, 1 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -891,44 +916,45 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr41 ; GFX10-NEXT: v_writelane_b32 v42, s33, 0 +; GFX10-NEXT: v_writelane_b32 v41, s4, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: 
v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v41, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v41, v32 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_mov_b32_e32 v40, v32 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v41, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s4 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v41 +; GFX10-NEXT: ; use v40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v41, 2 +; GFX10-NEXT: v_readlane_b32 s30, v41, 1 +; GFX10-NEXT: v_readlane_b32 s4, v41, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -33,6 +33,7 @@ ; GFX9-NEXT: s_add_u32 s34, s34, 
return_i1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -61,8 +62,9 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v1, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v1, s30, 0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -112,6 +114,7 @@ ; GFX9-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -140,8 +143,9 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v1, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v1, s30, 0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -191,6 +195,7 @@ ; GFX9-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -219,8 +224,9 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; 
GFX10-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v1, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v1, s30, 0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -272,6 +278,7 @@ ; GFX9-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -300,8 +307,9 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -1380,8 +1388,9 @@ ; GFX9-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 +; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -1410,9 +1419,10 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v2, s30, 0 -; GFX10-NEXT: s_load_dwordx2 
s[34:35], s[34:35], 0x0 +; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -400,6 +400,7 @@ ; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -480,6 +481,7 @@ ; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -564,6 +566,7 @@ ; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -647,6 +650,7 @@ ; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -732,6 +736,7 @@ ; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -814,6 +819,7 @@ ; GISEL-NEXT: 
v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -901,6 +907,7 @@ ; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -992,6 +999,7 @@ ; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1093,6 +1101,7 @@ ; GCN-NEXT: s_mov_b32 s5, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1187,6 +1196,7 @@ ; GISEL-NEXT: s_mov_b32 s5, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1280,96 +1290,97 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; 
GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: v_writelane_b32 v40, s52, 20 -; GCN-NEXT: v_writelane_b32 v40, s53, 21 -; GCN-NEXT: v_writelane_b32 v40, s54, 22 -; GCN-NEXT: v_writelane_b32 v40, s55, 23 -; GCN-NEXT: v_writelane_b32 v40, s56, 24 -; GCN-NEXT: v_writelane_b32 v40, s57, 25 -; GCN-NEXT: v_writelane_b32 v40, s58, 26 -; GCN-NEXT: v_writelane_b32 v40, s59, 27 -; GCN-NEXT: v_writelane_b32 v40, s60, 28 -; GCN-NEXT: v_writelane_b32 v40, s61, 29 -; GCN-NEXT: v_writelane_b32 v40, s62, 30 -; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: v_writelane_b32 v41, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s31, 1 +; GCN-NEXT: v_writelane_b32 v41, s34, 2 +; GCN-NEXT: v_writelane_b32 v41, s35, 3 +; GCN-NEXT: v_writelane_b32 v41, s36, 4 +; GCN-NEXT: v_writelane_b32 v41, s37, 5 +; GCN-NEXT: v_writelane_b32 v41, s38, 6 +; GCN-NEXT: v_writelane_b32 v41, s39, 7 +; GCN-NEXT: v_writelane_b32 v41, s40, 8 +; GCN-NEXT: v_writelane_b32 v41, s41, 9 +; GCN-NEXT: v_writelane_b32 v41, s42, 10 +; GCN-NEXT: v_writelane_b32 v41, s43, 11 +; GCN-NEXT: v_writelane_b32 v41, s44, 12 +; GCN-NEXT: v_writelane_b32 v41, s45, 13 +; GCN-NEXT: 
v_writelane_b32 v41, s46, 14 +; GCN-NEXT: v_writelane_b32 v41, s47, 15 +; GCN-NEXT: v_writelane_b32 v41, s48, 16 +; GCN-NEXT: v_writelane_b32 v41, s49, 17 +; GCN-NEXT: v_writelane_b32 v41, s50, 18 +; GCN-NEXT: v_writelane_b32 v41, s51, 19 +; GCN-NEXT: v_writelane_b32 v41, s52, 20 +; GCN-NEXT: v_writelane_b32 v41, s53, 21 +; GCN-NEXT: v_writelane_b32 v41, s54, 22 +; GCN-NEXT: v_writelane_b32 v41, s55, 23 +; GCN-NEXT: v_writelane_b32 v41, s56, 24 +; GCN-NEXT: v_writelane_b32 v41, s57, 25 +; GCN-NEXT: v_writelane_b32 v41, s58, 26 +; GCN-NEXT: v_writelane_b32 v41, s59, 27 +; GCN-NEXT: v_writelane_b32 v41, s60, 28 +; GCN-NEXT: v_writelane_b32 v41, s61, 29 +; GCN-NEXT: v_writelane_b32 v41, s62, 30 +; GCN-NEXT: v_writelane_b32 v41, s63, 31 +; GCN-NEXT: v_mov_b32_e32 v40, v0 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v2 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: v_mov_b32_e32 v0, v41 +; GCN-NEXT: v_mov_b32_e32 v0, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: s_xor_b64 exec, exec, s[6:7] ; GCN-NEXT: s_cbranch_execnz .LBB7_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v41 -; GCN-NEXT: v_readlane_b32 s63, v40, 31 -; GCN-NEXT: v_readlane_b32 s62, v40, 30 -; GCN-NEXT: v_readlane_b32 s61, v40, 29 -; GCN-NEXT: v_readlane_b32 s60, v40, 28 -; GCN-NEXT: v_readlane_b32 s59, v40, 27 -; GCN-NEXT: v_readlane_b32 s58, v40, 26 -; GCN-NEXT: v_readlane_b32 s57, v40, 25 -; GCN-NEXT: v_readlane_b32 s56, v40, 24 -; GCN-NEXT: v_readlane_b32 s55, v40, 23 -; GCN-NEXT: v_readlane_b32 s54, v40, 22 -; GCN-NEXT: v_readlane_b32 s53, v40, 21 -; GCN-NEXT: v_readlane_b32 s52, v40, 20 -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: 
v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: v_readlane_b32 s63, v41, 31 +; GCN-NEXT: v_readlane_b32 s62, v41, 30 +; GCN-NEXT: v_readlane_b32 s61, v41, 29 +; GCN-NEXT: v_readlane_b32 s60, v41, 28 +; GCN-NEXT: v_readlane_b32 s59, v41, 27 +; GCN-NEXT: v_readlane_b32 s58, v41, 26 +; GCN-NEXT: v_readlane_b32 s57, v41, 25 +; GCN-NEXT: v_readlane_b32 s56, v41, 24 +; GCN-NEXT: v_readlane_b32 s55, v41, 23 +; GCN-NEXT: v_readlane_b32 s54, v41, 22 +; GCN-NEXT: v_readlane_b32 s53, v41, 21 +; GCN-NEXT: v_readlane_b32 s52, v41, 20 +; GCN-NEXT: v_readlane_b32 s51, v41, 19 +; GCN-NEXT: v_readlane_b32 s50, v41, 18 +; GCN-NEXT: v_readlane_b32 s49, v41, 17 +; GCN-NEXT: v_readlane_b32 s48, v41, 16 +; GCN-NEXT: v_readlane_b32 s47, v41, 15 +; GCN-NEXT: v_readlane_b32 s46, v41, 14 +; GCN-NEXT: v_readlane_b32 s45, v41, 13 +; GCN-NEXT: v_readlane_b32 s44, v41, 12 +; GCN-NEXT: v_readlane_b32 s43, v41, 11 +; GCN-NEXT: v_readlane_b32 s42, v41, 10 +; GCN-NEXT: v_readlane_b32 s41, v41, 9 +; GCN-NEXT: v_readlane_b32 s40, v41, 8 +; GCN-NEXT: v_readlane_b32 s39, v41, 7 +; GCN-NEXT: v_readlane_b32 s38, v41, 6 +; GCN-NEXT: v_readlane_b32 s37, v41, 5 +; GCN-NEXT: v_readlane_b32 s36, v41, 4 +; GCN-NEXT: v_readlane_b32 s35, v41, 3 +; GCN-NEXT: v_readlane_b32 s34, 
v41, 2 +; GCN-NEXT: v_readlane_b32 s31, v41, 1 +; GCN-NEXT: v_readlane_b32 s30, v41, 0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1378,96 +1389,97 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 -; GISEL-NEXT: v_writelane_b32 v40, s52, 20 -; GISEL-NEXT: 
v_writelane_b32 v40, s53, 21 -; GISEL-NEXT: v_writelane_b32 v40, s54, 22 -; GISEL-NEXT: v_writelane_b32 v40, s55, 23 -; GISEL-NEXT: v_writelane_b32 v40, s56, 24 -; GISEL-NEXT: v_writelane_b32 v40, s57, 25 -; GISEL-NEXT: v_writelane_b32 v40, s58, 26 -; GISEL-NEXT: v_writelane_b32 v40, s59, 27 -; GISEL-NEXT: v_writelane_b32 v40, s60, 28 -; GISEL-NEXT: v_writelane_b32 v40, s61, 29 -; GISEL-NEXT: v_writelane_b32 v40, s62, 30 -; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: v_mov_b32_e32 v41, v0 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: ; implicit-def: $vgpr41 +; GISEL-NEXT: v_writelane_b32 v41, s30, 0 +; GISEL-NEXT: v_writelane_b32 v41, s31, 1 +; GISEL-NEXT: v_writelane_b32 v41, s34, 2 +; GISEL-NEXT: v_writelane_b32 v41, s35, 3 +; GISEL-NEXT: v_writelane_b32 v41, s36, 4 +; GISEL-NEXT: v_writelane_b32 v41, s37, 5 +; GISEL-NEXT: v_writelane_b32 v41, s38, 6 +; GISEL-NEXT: v_writelane_b32 v41, s39, 7 +; GISEL-NEXT: v_writelane_b32 v41, s40, 8 +; GISEL-NEXT: v_writelane_b32 v41, s41, 9 +; GISEL-NEXT: v_writelane_b32 v41, s42, 10 +; GISEL-NEXT: v_writelane_b32 v41, s43, 11 +; GISEL-NEXT: v_writelane_b32 v41, s44, 12 +; GISEL-NEXT: v_writelane_b32 v41, s45, 13 +; GISEL-NEXT: v_writelane_b32 v41, s46, 14 +; GISEL-NEXT: v_writelane_b32 v41, s47, 15 +; GISEL-NEXT: v_writelane_b32 v41, s48, 16 +; GISEL-NEXT: v_writelane_b32 v41, s49, 17 +; GISEL-NEXT: v_writelane_b32 v41, s50, 18 +; GISEL-NEXT: v_writelane_b32 v41, s51, 19 +; GISEL-NEXT: v_writelane_b32 v41, s52, 20 +; GISEL-NEXT: v_writelane_b32 v41, s53, 21 +; GISEL-NEXT: v_writelane_b32 v41, s54, 22 +; GISEL-NEXT: v_writelane_b32 v41, s55, 23 +; GISEL-NEXT: v_writelane_b32 v41, s56, 24 +; GISEL-NEXT: v_writelane_b32 v41, s57, 25 +; GISEL-NEXT: v_writelane_b32 v41, s58, 26 +; GISEL-NEXT: v_writelane_b32 v41, s59, 27 +; GISEL-NEXT: v_writelane_b32 v41, s60, 28 +; GISEL-NEXT: v_writelane_b32 v41, s61, 29 +; GISEL-NEXT: v_writelane_b32 v41, s62, 30 +; GISEL-NEXT: 
v_writelane_b32 v41, s63, 31 +; GISEL-NEXT: v_mov_b32_e32 v40, v0 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v1 ; GISEL-NEXT: v_readfirstlane_b32 s7, v2 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] ; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v41 +; GISEL-NEXT: v_mov_b32_e32 v0, v40 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr1 ; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB7_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_mov_b32_e32 v0, v41 -; GISEL-NEXT: v_readlane_b32 s63, v40, 31 -; GISEL-NEXT: v_readlane_b32 s62, v40, 30 -; GISEL-NEXT: v_readlane_b32 s61, v40, 29 -; GISEL-NEXT: v_readlane_b32 s60, v40, 28 -; GISEL-NEXT: v_readlane_b32 s59, v40, 27 -; GISEL-NEXT: v_readlane_b32 s58, v40, 26 -; GISEL-NEXT: v_readlane_b32 s57, v40, 25 -; GISEL-NEXT: v_readlane_b32 s56, v40, 24 -; GISEL-NEXT: v_readlane_b32 s55, v40, 23 -; GISEL-NEXT: v_readlane_b32 s54, v40, 22 -; GISEL-NEXT: v_readlane_b32 s53, v40, 21 -; GISEL-NEXT: v_readlane_b32 s52, v40, 20 -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: v_mov_b32_e32 v0, v40 +; GISEL-NEXT: v_readlane_b32 s63, v41, 31 +; GISEL-NEXT: v_readlane_b32 s62, v41, 30 +; GISEL-NEXT: v_readlane_b32 s61, v41, 29 +; GISEL-NEXT: v_readlane_b32 s60, v41, 28 +; GISEL-NEXT: v_readlane_b32 s59, v41, 27 +; GISEL-NEXT: v_readlane_b32 s58, v41, 26 +; GISEL-NEXT: v_readlane_b32 s57, v41, 25 +; GISEL-NEXT: v_readlane_b32 s56, v41, 24 +; GISEL-NEXT: v_readlane_b32 s55, v41, 23 +; GISEL-NEXT: v_readlane_b32 s54, v41, 22 +; GISEL-NEXT: v_readlane_b32 s53, v41, 21 +; GISEL-NEXT: v_readlane_b32 s52, v41, 20 +; GISEL-NEXT: v_readlane_b32 s51, v41, 19 +; GISEL-NEXT: v_readlane_b32 s50, v41, 18 +; GISEL-NEXT: v_readlane_b32 s49, v41, 17 +; GISEL-NEXT: v_readlane_b32 s48, v41, 16 +; GISEL-NEXT: v_readlane_b32 s47, v41, 15 +; GISEL-NEXT: v_readlane_b32 s46, v41, 14 +; GISEL-NEXT: v_readlane_b32 s45, v41, 13 +; GISEL-NEXT: v_readlane_b32 s44, v41, 12 +; GISEL-NEXT: v_readlane_b32 s43, v41, 11 +; GISEL-NEXT: v_readlane_b32 s42, v41, 10 +; GISEL-NEXT: v_readlane_b32 s41, v41, 9 +; GISEL-NEXT: v_readlane_b32 s40, v41, 8 +; GISEL-NEXT: v_readlane_b32 s39, v41, 7 +; GISEL-NEXT: v_readlane_b32 s38, v41, 6 +; GISEL-NEXT: v_readlane_b32 s37, v41, 5 +; GISEL-NEXT: v_readlane_b32 s36, v41, 4 +; GISEL-NEXT: v_readlane_b32 s35, v41, 3 +; GISEL-NEXT: v_readlane_b32 s34, v41, 2 +; GISEL-NEXT: v_readlane_b32 s31, v41, 1 +; GISEL-NEXT: v_readlane_b32 s30, v41, 0 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ 
-1489,6 +1501,7 @@ ; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1585,6 +1598,7 @@ ; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1686,6 +1700,7 @@ ; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1779,6 +1794,7 @@ ; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -262,7 +262,10 @@ ; W64-O0: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] ; W64-O0: ; %bb.{{[0-9]+}}: ; %bb1 -; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; W64-O0: buffer_load_dword +; W64-O0: buffer_store_dword +; W64-O0: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; W64-O0: buffer_load_dword ; W64-O0-DAG: s_mov_b64 s[[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]], exec ; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] ; W64-O0: 
v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] @@ -288,6 +291,8 @@ ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] +; W64-O0: buffer_store_dword +; W64-O0: buffer_load_dword ; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -189,45 +189,46 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: v_writelane_b32 v44, s33, 0 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s35, 3 +; GFX9-NEXT: v_writelane_b32 v42, s35, 3 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword 
v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: v_mov_b32_e32 v42, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 -; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 +; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 -; GFX9-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v43 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 +; GFX9-NEXT: v_add_u32_e32 v0, v40, v43 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s35, v40, 3 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s35, v42, 3 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: v_readlane_b32 s33, v44, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload 
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -28,14 +28,16 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v1, s30, 0 -; CHECK-NEXT: v_writelane_b32 v1, s31, 1 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 @@ -44,16 +46,18 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9] ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s31, v1, 1 -; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s31, v0, 1 +; CHECK-NEXT: v_readlane_b32 s30, v0, 0 ; CHECK-NEXT: buffer_load_dword v40, 
off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -91,20 +95,21 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v1, s33, 0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s33, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 -; CHECK-NEXT: v_readlane_b32 s33, v1, 0 +; CHECK-NEXT: v_readlane_b32 s33, v0, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[4:5] bb: @@ -158,6 +163,7 @@ ; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 +; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: v_writelane_b32 v1, s30, 0 ; CHECK-NEXT: v_writelane_b32 v1, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -187,13 +193,15 @@ ; CHECK: ; %bb.0: ; %entry ; 
CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_mov_b32 s7, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v2, s30, 0 -; CHECK-NEXT: v_writelane_b32 v2, s31, 1 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, caller_save_vgpr_spill_fp_tail_call@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, caller_save_vgpr_spill_fp_tail_call@rel32@hi+12 @@ -202,12 +210,14 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9] ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: v_readlane_b32 s31, v2, 1 -; CHECK-NEXT: v_readlane_b32 s30, v2, 0 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s31, v1, 1 +; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s7 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -14,16 +14,18 @@ ; CHECK-NEXT: ; %bb.0: ; 
%entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: v_writelane_b32 v41, s33, 0 +; CHECK-NEXT: v_writelane_b32 v40, s33, 0 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 +; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _ZL13sleep_foreverv@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _ZL13sleep_foreverv@gotpcrel32@hi+12 @@ -35,14 +37,16 @@ ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s31, v0, 1 +; CHECK-NEXT: v_readlane_b32 s30, v0, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v41, 0 +; CHECK-NEXT: v_readlane_b32 s33, v40, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -11,10 +11,17 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 { ; GCN-LABEL: spill_sgprs_to_multiple_vgprs: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s92, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s93, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s94, -1 +; GCN-NEXT: s_mov_b32 s95, 0xe8f000 +; GCN-NEXT: s_add_u32 s92, s92, s3 +; GCN-NEXT: s_addc_u32 s93, s93, 0 ; GCN-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: v_writelane_b32 v0, s5, 1 ; GCN-NEXT: v_writelane_b32 v0, s6, 2 @@ -100,264 +107,273 @@ ; GCN-NEXT: v_writelane_b32 v0, s9, 61 ; GCN-NEXT: v_writelane_b32 v0, s10, 62 ; GCN-NEXT: v_writelane_b32 v0, s11, 63 +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; 
GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 8 -; GCN-NEXT: v_writelane_b32 v1, s5, 9 -; GCN-NEXT: v_writelane_b32 v1, s6, 10 -; GCN-NEXT: v_writelane_b32 v1, s7, 11 -; GCN-NEXT: v_writelane_b32 v1, s8, 12 -; GCN-NEXT: v_writelane_b32 v1, s9, 13 -; GCN-NEXT: v_writelane_b32 v1, s10, 14 -; GCN-NEXT: v_writelane_b32 v1, s11, 15 +; GCN-NEXT: v_writelane_b32 v0, s4, 8 +; GCN-NEXT: v_writelane_b32 v0, s5, 9 +; GCN-NEXT: v_writelane_b32 v0, s6, 10 +; GCN-NEXT: v_writelane_b32 v0, s7, 11 +; GCN-NEXT: v_writelane_b32 v0, s8, 12 +; GCN-NEXT: v_writelane_b32 v0, s9, 13 +; GCN-NEXT: v_writelane_b32 v0, s10, 14 +; GCN-NEXT: v_writelane_b32 v0, s11, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 24 -; GCN-NEXT: v_writelane_b32 v1, s5, 25 -; GCN-NEXT: v_writelane_b32 v1, s6, 26 -; GCN-NEXT: v_writelane_b32 v1, s7, 27 -; GCN-NEXT: v_writelane_b32 v1, s8, 28 -; GCN-NEXT: v_writelane_b32 v1, s9, 29 -; GCN-NEXT: v_writelane_b32 v1, s10, 30 -; GCN-NEXT: v_writelane_b32 v1, s11, 31 +; GCN-NEXT: 
v_writelane_b32 v0, s4, 24 +; GCN-NEXT: v_writelane_b32 v0, s5, 25 +; GCN-NEXT: v_writelane_b32 v0, s6, 26 +; GCN-NEXT: v_writelane_b32 v0, s7, 27 +; GCN-NEXT: v_writelane_b32 v0, s8, 28 +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: v_writelane_b32 v0, s10, 30 +; GCN-NEXT: v_writelane_b32 v0, s11, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 40 -; GCN-NEXT: v_writelane_b32 v1, s5, 41 -; GCN-NEXT: v_writelane_b32 v1, s6, 42 -; GCN-NEXT: v_writelane_b32 v1, s7, 43 -; GCN-NEXT: v_writelane_b32 v1, s8, 44 -; GCN-NEXT: v_writelane_b32 v1, s9, 45 -; GCN-NEXT: v_writelane_b32 v1, s10, 46 -; GCN-NEXT: v_writelane_b32 v1, s11, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 40 +; GCN-NEXT: v_writelane_b32 v0, s5, 41 +; GCN-NEXT: v_writelane_b32 v0, s6, 42 +; GCN-NEXT: v_writelane_b32 v0, s7, 43 +; GCN-NEXT: v_writelane_b32 v0, s8, 44 +; GCN-NEXT: v_writelane_b32 v0, s9, 45 +; GCN-NEXT: v_writelane_b32 v0, s10, 46 +; GCN-NEXT: v_writelane_b32 v0, s11, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: 
v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 56 -; GCN-NEXT: v_writelane_b32 v1, s5, 57 -; GCN-NEXT: v_writelane_b32 v1, s6, 58 -; GCN-NEXT: v_writelane_b32 v1, s7, 59 -; GCN-NEXT: v_writelane_b32 v1, s8, 60 -; GCN-NEXT: v_writelane_b32 v1, s9, 61 -; GCN-NEXT: v_writelane_b32 v1, s10, 62 -; GCN-NEXT: v_writelane_b32 v1, s11, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 56 +; GCN-NEXT: v_writelane_b32 v0, s5, 57 +; GCN-NEXT: v_writelane_b32 v0, s6, 58 +; GCN-NEXT: v_writelane_b32 v0, s7, 59 +; GCN-NEXT: v_writelane_b32 v0, s8, 60 +; GCN-NEXT: v_writelane_b32 v0, s9, 61 +; GCN-NEXT: v_writelane_b32 v0, s10, 62 +; GCN-NEXT: v_writelane_b32 v0, s11, 63 +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 0 -; GCN-NEXT: v_writelane_b32 v2, s5, 1 -; GCN-NEXT: v_writelane_b32 v2, s6, 2 -; GCN-NEXT: v_writelane_b32 v2, s7, 3 -; GCN-NEXT: v_writelane_b32 v2, s8, 4 -; GCN-NEXT: v_writelane_b32 v2, s9, 5 -; GCN-NEXT: v_writelane_b32 v2, s10, 6 -; GCN-NEXT: v_writelane_b32 v2, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, 
s11, 7 +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s8, v1, 56 -; GCN-NEXT: v_readlane_b32 s9, v1, 57 -; GCN-NEXT: v_readlane_b32 s10, v1, 58 -; GCN-NEXT: v_readlane_b32 s11, v1, 59 -; GCN-NEXT: v_readlane_b32 s12, v1, 60 -; GCN-NEXT: v_readlane_b32 s13, v1, 61 -; GCN-NEXT: v_readlane_b32 s14, v1, 62 -; GCN-NEXT: v_readlane_b32 s15, v1, 63 -; GCN-NEXT: v_readlane_b32 s16, v1, 48 -; GCN-NEXT: v_readlane_b32 s17, v1, 49 -; GCN-NEXT: v_readlane_b32 s18, v1, 50 -; GCN-NEXT: v_readlane_b32 s19, v1, 51 -; GCN-NEXT: v_readlane_b32 s20, v1, 52 -; GCN-NEXT: v_readlane_b32 s21, v1, 53 -; GCN-NEXT: v_readlane_b32 s22, v1, 54 -; GCN-NEXT: v_readlane_b32 s23, v1, 55 -; GCN-NEXT: v_readlane_b32 s24, v1, 40 -; GCN-NEXT: v_readlane_b32 s25, v1, 41 -; GCN-NEXT: v_readlane_b32 s26, v1, 42 -; GCN-NEXT: v_readlane_b32 s27, v1, 43 -; GCN-NEXT: v_readlane_b32 s28, v1, 44 -; GCN-NEXT: v_readlane_b32 s29, v1, 45 -; GCN-NEXT: v_readlane_b32 s30, v1, 46 -; GCN-NEXT: v_readlane_b32 s31, v1, 47 -; GCN-NEXT: v_readlane_b32 s36, v1, 32 -; GCN-NEXT: v_readlane_b32 s37, v1, 33 -; GCN-NEXT: v_readlane_b32 s38, v1, 34 -; GCN-NEXT: v_readlane_b32 s39, v1, 35 -; GCN-NEXT: v_readlane_b32 s40, v1, 36 -; GCN-NEXT: v_readlane_b32 s41, v1, 37 -; GCN-NEXT: v_readlane_b32 s42, v1, 38 -; GCN-NEXT: v_readlane_b32 s43, v1, 39 -; GCN-NEXT: v_readlane_b32 s44, v1, 24 -; GCN-NEXT: v_readlane_b32 s45, v1, 25 -; GCN-NEXT: v_readlane_b32 s46, v1, 26 -; GCN-NEXT: v_readlane_b32 s47, v1, 27 -; GCN-NEXT: v_readlane_b32 s48, v1, 28 -; GCN-NEXT: v_readlane_b32 s49, v1, 29 -; GCN-NEXT: v_readlane_b32 s50, v1, 30 -; GCN-NEXT: v_readlane_b32 s51, v1, 31 -; GCN-NEXT: v_readlane_b32 s52, v1, 16 -; GCN-NEXT: v_readlane_b32 s53, v1, 17 -; GCN-NEXT: v_readlane_b32 s54, v1, 18 -; GCN-NEXT: v_readlane_b32 s55, 
v1, 19 -; GCN-NEXT: v_readlane_b32 s56, v1, 20 -; GCN-NEXT: v_readlane_b32 s57, v1, 21 -; GCN-NEXT: v_readlane_b32 s58, v1, 22 -; GCN-NEXT: v_readlane_b32 s59, v1, 23 -; GCN-NEXT: v_readlane_b32 s60, v1, 8 -; GCN-NEXT: v_readlane_b32 s61, v1, 9 -; GCN-NEXT: v_readlane_b32 s62, v1, 10 -; GCN-NEXT: v_readlane_b32 s63, v1, 11 -; GCN-NEXT: v_readlane_b32 s64, v1, 12 -; GCN-NEXT: v_readlane_b32 s65, v1, 13 -; GCN-NEXT: v_readlane_b32 s66, v1, 14 -; GCN-NEXT: v_readlane_b32 s67, v1, 15 -; GCN-NEXT: v_readlane_b32 s68, v1, 0 -; GCN-NEXT: v_readlane_b32 s69, v1, 1 -; GCN-NEXT: v_readlane_b32 s70, v1, 2 -; GCN-NEXT: v_readlane_b32 s71, v1, 3 -; GCN-NEXT: v_readlane_b32 s72, v1, 4 -; GCN-NEXT: v_readlane_b32 s73, v1, 5 -; GCN-NEXT: v_readlane_b32 s74, v1, 6 -; GCN-NEXT: v_readlane_b32 s75, v1, 7 -; GCN-NEXT: v_readlane_b32 s76, v0, 56 -; GCN-NEXT: v_readlane_b32 s77, v0, 57 -; GCN-NEXT: v_readlane_b32 s78, v0, 58 -; GCN-NEXT: v_readlane_b32 s79, v0, 59 -; GCN-NEXT: v_readlane_b32 s80, v0, 60 -; GCN-NEXT: v_readlane_b32 s81, v0, 61 -; GCN-NEXT: v_readlane_b32 s82, v0, 62 -; GCN-NEXT: v_readlane_b32 s83, v0, 63 -; GCN-NEXT: v_readlane_b32 s84, v0, 48 -; GCN-NEXT: v_readlane_b32 s85, v0, 49 -; GCN-NEXT: v_readlane_b32 s86, v0, 50 -; GCN-NEXT: v_readlane_b32 s87, v0, 51 -; GCN-NEXT: v_readlane_b32 s88, v0, 52 -; GCN-NEXT: v_readlane_b32 s89, v0, 53 -; GCN-NEXT: v_readlane_b32 s90, v0, 54 -; GCN-NEXT: v_readlane_b32 s91, v0, 55 -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_readlane_b32 s2, v0, 2 -; GCN-NEXT: v_readlane_b32 s3, v0, 3 -; GCN-NEXT: v_readlane_b32 s4, v0, 4 -; GCN-NEXT: v_readlane_b32 s5, v0, 5 -; GCN-NEXT: v_readlane_b32 s6, v0, 6 -; GCN-NEXT: v_readlane_b32 s7, v0, 7 +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s8, v2, 56 +; GCN-NEXT: v_readlane_b32 s9, v2, 57 +; GCN-NEXT: v_readlane_b32 s10, v2, 58 +; GCN-NEXT: v_readlane_b32 s11, v2, 59 +; GCN-NEXT: v_readlane_b32 s12, v2, 60 +; GCN-NEXT: v_readlane_b32 s13, v2, 61 +; GCN-NEXT: v_readlane_b32 s14, v2, 62 +; GCN-NEXT: v_readlane_b32 s15, v2, 63 +; GCN-NEXT: v_readlane_b32 s16, v2, 48 +; GCN-NEXT: v_readlane_b32 s17, v2, 49 +; GCN-NEXT: v_readlane_b32 s18, v2, 50 +; GCN-NEXT: v_readlane_b32 s19, v2, 51 +; GCN-NEXT: v_readlane_b32 s20, v2, 52 +; GCN-NEXT: v_readlane_b32 s21, v2, 53 +; GCN-NEXT: v_readlane_b32 s22, v2, 54 +; GCN-NEXT: v_readlane_b32 s23, v2, 55 +; GCN-NEXT: v_readlane_b32 s24, v2, 40 +; GCN-NEXT: v_readlane_b32 s25, v2, 41 +; GCN-NEXT: v_readlane_b32 s26, v2, 42 +; GCN-NEXT: v_readlane_b32 s27, v2, 43 +; GCN-NEXT: v_readlane_b32 s28, v2, 44 +; GCN-NEXT: v_readlane_b32 s29, v2, 45 +; GCN-NEXT: v_readlane_b32 s30, v2, 46 +; GCN-NEXT: v_readlane_b32 s31, v2, 47 +; GCN-NEXT: v_readlane_b32 s36, v2, 32 +; GCN-NEXT: v_readlane_b32 s37, v2, 33 +; GCN-NEXT: v_readlane_b32 s38, v2, 34 +; GCN-NEXT: v_readlane_b32 s39, v2, 35 +; GCN-NEXT: v_readlane_b32 s40, v2, 36 +; GCN-NEXT: v_readlane_b32 s41, v2, 37 +; GCN-NEXT: v_readlane_b32 s42, v2, 38 +; GCN-NEXT: v_readlane_b32 s43, v2, 39 +; GCN-NEXT: v_readlane_b32 s44, v2, 24 +; GCN-NEXT: v_readlane_b32 s45, v2, 25 +; GCN-NEXT: v_readlane_b32 s46, v2, 26 +; GCN-NEXT: v_readlane_b32 s47, v2, 27 +; GCN-NEXT: v_readlane_b32 s48, v2, 28 +; GCN-NEXT: v_readlane_b32 s49, v2, 29 +; GCN-NEXT: v_readlane_b32 s50, v2, 30 +; GCN-NEXT: v_readlane_b32 s51, v2, 31 +; GCN-NEXT: v_readlane_b32 s52, v2, 16 +; GCN-NEXT: v_readlane_b32 s53, v2, 17 +; GCN-NEXT: v_readlane_b32 s54, v2, 18 +; GCN-NEXT: v_readlane_b32 s55, v2, 19 +; GCN-NEXT: v_readlane_b32 s56, v2, 20 +; GCN-NEXT: v_readlane_b32 s57, v2, 21 +; GCN-NEXT: v_readlane_b32 s58, v2, 22 +; GCN-NEXT: v_readlane_b32 s59, v2, 23 +; GCN-NEXT: v_readlane_b32 s60, v2, 8 +; GCN-NEXT: 
v_readlane_b32 s61, v2, 9 +; GCN-NEXT: v_readlane_b32 s62, v2, 10 +; GCN-NEXT: v_readlane_b32 s63, v2, 11 +; GCN-NEXT: v_readlane_b32 s64, v2, 12 +; GCN-NEXT: v_readlane_b32 s65, v2, 13 +; GCN-NEXT: v_readlane_b32 s66, v2, 14 +; GCN-NEXT: v_readlane_b32 s67, v2, 15 +; GCN-NEXT: v_readlane_b32 s68, v2, 0 +; GCN-NEXT: v_readlane_b32 s69, v2, 1 +; GCN-NEXT: v_readlane_b32 s70, v2, 2 +; GCN-NEXT: v_readlane_b32 s71, v2, 3 +; GCN-NEXT: v_readlane_b32 s72, v2, 4 +; GCN-NEXT: v_readlane_b32 s73, v2, 5 +; GCN-NEXT: v_readlane_b32 s74, v2, 6 +; GCN-NEXT: v_readlane_b32 s75, v2, 7 +; GCN-NEXT: v_readlane_b32 s76, v1, 56 +; GCN-NEXT: v_readlane_b32 s77, v1, 57 +; GCN-NEXT: v_readlane_b32 s78, v1, 58 +; GCN-NEXT: v_readlane_b32 s79, v1, 59 +; GCN-NEXT: v_readlane_b32 s80, v1, 60 +; GCN-NEXT: v_readlane_b32 s81, v1, 61 +; GCN-NEXT: v_readlane_b32 s82, v1, 62 +; GCN-NEXT: v_readlane_b32 s83, v1, 63 +; GCN-NEXT: v_readlane_b32 s84, v1, 48 +; GCN-NEXT: v_readlane_b32 s85, v1, 49 +; GCN-NEXT: v_readlane_b32 s86, v1, 50 +; GCN-NEXT: v_readlane_b32 s87, v1, 51 +; GCN-NEXT: v_readlane_b32 s88, v1, 52 +; GCN-NEXT: v_readlane_b32 s89, v1, 53 +; GCN-NEXT: v_readlane_b32 s90, v1, 54 +; GCN-NEXT: v_readlane_b32 s91, v1, 55 +; GCN-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-NEXT: v_readlane_b32 s2, v1, 2 +; GCN-NEXT: v_readlane_b32 s3, v1, 3 +; GCN-NEXT: v_readlane_b32 s4, v1, 4 +; GCN-NEXT: v_readlane_b32 s5, v1, 5 +; GCN-NEXT: v_readlane_b32 s6, v1, 6 +; GCN-NEXT: v_readlane_b32 s7, v1, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 8 -; GCN-NEXT: v_readlane_b32 s1, v0, 9 -; GCN-NEXT: v_readlane_b32 s2, v0, 10 -; GCN-NEXT: v_readlane_b32 s3, v0, 11 -; GCN-NEXT: v_readlane_b32 s4, v0, 12 -; GCN-NEXT: v_readlane_b32 s5, v0, 13 -; GCN-NEXT: v_readlane_b32 s6, v0, 14 -; GCN-NEXT: v_readlane_b32 s7, v0, 15 +; GCN-NEXT: v_readlane_b32 s0, v1, 8 +; GCN-NEXT: v_readlane_b32 s1, v1, 9 +; GCN-NEXT: 
v_readlane_b32 s2, v1, 10 +; GCN-NEXT: v_readlane_b32 s3, v1, 11 +; GCN-NEXT: v_readlane_b32 s4, v1, 12 +; GCN-NEXT: v_readlane_b32 s5, v1, 13 +; GCN-NEXT: v_readlane_b32 s6, v1, 14 +; GCN-NEXT: v_readlane_b32 s7, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 16 -; GCN-NEXT: v_readlane_b32 s1, v0, 17 -; GCN-NEXT: v_readlane_b32 s2, v0, 18 -; GCN-NEXT: v_readlane_b32 s3, v0, 19 -; GCN-NEXT: v_readlane_b32 s4, v0, 20 -; GCN-NEXT: v_readlane_b32 s5, v0, 21 -; GCN-NEXT: v_readlane_b32 s6, v0, 22 -; GCN-NEXT: v_readlane_b32 s7, v0, 23 +; GCN-NEXT: v_readlane_b32 s0, v1, 16 +; GCN-NEXT: v_readlane_b32 s1, v1, 17 +; GCN-NEXT: v_readlane_b32 s2, v1, 18 +; GCN-NEXT: v_readlane_b32 s3, v1, 19 +; GCN-NEXT: v_readlane_b32 s4, v1, 20 +; GCN-NEXT: v_readlane_b32 s5, v1, 21 +; GCN-NEXT: v_readlane_b32 s6, v1, 22 +; GCN-NEXT: v_readlane_b32 s7, v1, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 24 -; GCN-NEXT: v_readlane_b32 s1, v0, 25 -; GCN-NEXT: v_readlane_b32 s2, v0, 26 -; GCN-NEXT: v_readlane_b32 s3, v0, 27 -; GCN-NEXT: v_readlane_b32 s4, v0, 28 -; GCN-NEXT: v_readlane_b32 s5, v0, 29 -; GCN-NEXT: v_readlane_b32 s6, v0, 30 -; GCN-NEXT: v_readlane_b32 s7, v0, 31 +; GCN-NEXT: v_readlane_b32 s0, v1, 24 +; GCN-NEXT: v_readlane_b32 s1, v1, 25 +; GCN-NEXT: v_readlane_b32 s2, v1, 26 +; GCN-NEXT: v_readlane_b32 s3, v1, 27 +; GCN-NEXT: v_readlane_b32 s4, v1, 28 +; GCN-NEXT: v_readlane_b32 s5, v1, 29 +; GCN-NEXT: v_readlane_b32 s6, v1, 30 +; GCN-NEXT: v_readlane_b32 s7, v1, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 32 -; GCN-NEXT: v_readlane_b32 s1, v0, 33 -; GCN-NEXT: v_readlane_b32 s2, v0, 34 -; GCN-NEXT: v_readlane_b32 s3, v0, 35 -; GCN-NEXT: v_readlane_b32 s4, v0, 36 -; GCN-NEXT: v_readlane_b32 s5, v0, 37 -; GCN-NEXT: v_readlane_b32 s6, v0, 38 -; GCN-NEXT: v_readlane_b32 s7, v0, 39 +; 
GCN-NEXT: v_readlane_b32 s0, v1, 32 +; GCN-NEXT: v_readlane_b32 s1, v1, 33 +; GCN-NEXT: v_readlane_b32 s2, v1, 34 +; GCN-NEXT: v_readlane_b32 s3, v1, 35 +; GCN-NEXT: v_readlane_b32 s4, v1, 36 +; GCN-NEXT: v_readlane_b32 s5, v1, 37 +; GCN-NEXT: v_readlane_b32 s6, v1, 38 +; GCN-NEXT: v_readlane_b32 s7, v1, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 40 -; GCN-NEXT: v_readlane_b32 s1, v0, 41 -; GCN-NEXT: v_readlane_b32 s2, v0, 42 -; GCN-NEXT: v_readlane_b32 s3, v0, 43 -; GCN-NEXT: v_readlane_b32 s4, v0, 44 -; GCN-NEXT: v_readlane_b32 s5, v0, 45 -; GCN-NEXT: v_readlane_b32 s6, v0, 46 -; GCN-NEXT: v_readlane_b32 s7, v0, 47 +; GCN-NEXT: v_readlane_b32 s0, v1, 40 +; GCN-NEXT: v_readlane_b32 s1, v1, 41 +; GCN-NEXT: v_readlane_b32 s2, v1, 42 +; GCN-NEXT: v_readlane_b32 s3, v1, 43 +; GCN-NEXT: v_readlane_b32 s4, v1, 44 +; GCN-NEXT: v_readlane_b32 s5, v1, 45 +; GCN-NEXT: v_readlane_b32 s6, v1, 46 +; GCN-NEXT: v_readlane_b32 s7, v1, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v2, 0 -; GCN-NEXT: v_readlane_b32 s1, v2, 1 -; GCN-NEXT: v_readlane_b32 s2, v2, 2 -; GCN-NEXT: v_readlane_b32 s3, v2, 3 -; GCN-NEXT: v_readlane_b32 s4, v2, 4 -; GCN-NEXT: v_readlane_b32 s5, v2, 5 -; GCN-NEXT: v_readlane_b32 s6, v2, 6 -; GCN-NEXT: v_readlane_b32 s7, v2, 7 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s2, v0, 2 +; GCN-NEXT: v_readlane_b32 s3, v0, 3 +; GCN-NEXT: v_readlane_b32 s4, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v0, 5 +; GCN-NEXT: v_readlane_b32 s6, v0, 6 +; GCN-NEXT: v_readlane_b32 s7, v0, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[84:91] ; GCN-NEXT: ;;#ASMEND @@ -442,10 +458,17 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 { ; GCN-LABEL: split_sgpr_spill_2_vgprs: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s28, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s29, 
SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s30, -1 +; GCN-NEXT: s_mov_b32 s31, 0xe8f000 +; GCN-NEXT: s_add_u32 s28, s28, s3 +; GCN-NEXT: s_addc_u32 s29, s29, 0 ; GCN-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: v_writelane_b32 v0, s5, 1 ; GCN-NEXT: v_writelane_b32 v0, s6, 2 @@ -519,27 +542,33 @@ ; GCN-NEXT: v_writelane_b32 v0, s17, 61 ; GCN-NEXT: v_writelane_b32 v0, s18, 62 ; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s2, 8 -; GCN-NEXT: v_writelane_b32 v1, s3, 9 +; GCN-NEXT: v_writelane_b32 v0, s2, 8 +; GCN-NEXT: v_writelane_b32 v0, s3, 9 +; GCN-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 +; GCN-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:4 ; 
4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s16, v1, 8 ; GCN-NEXT: v_readlane_b32 s17, v1, 9 ; GCN-NEXT: v_readlane_b32 s20, v1, 0 @@ -685,176 +714,168 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 0 -; GCN-NEXT: v_writelane_b32 v31, s5, 1 -; GCN-NEXT: v_writelane_b32 v31, s6, 2 -; GCN-NEXT: v_writelane_b32 v31, s7, 3 -; GCN-NEXT: v_writelane_b32 v31, s8, 4 -; GCN-NEXT: v_writelane_b32 v31, s9, 5 -; GCN-NEXT: v_writelane_b32 v31, s10, 6 -; GCN-NEXT: v_writelane_b32 v31, s11, 7 -; GCN-NEXT: v_writelane_b32 v31, s12, 8 -; GCN-NEXT: v_writelane_b32 v31, s13, 9 -; GCN-NEXT: v_writelane_b32 v31, s14, 10 -; GCN-NEXT: v_writelane_b32 v31, s15, 11 -; GCN-NEXT: v_writelane_b32 v31, s16, 12 -; GCN-NEXT: v_writelane_b32 v31, s17, 13 -; GCN-NEXT: v_writelane_b32 v31, s18, 14 -; GCN-NEXT: v_writelane_b32 v31, s19, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 16 -; GCN-NEXT: v_writelane_b32 v31, s5, 17 -; GCN-NEXT: v_writelane_b32 v31, s6, 18 -; GCN-NEXT: v_writelane_b32 v31, s7, 19 -; GCN-NEXT: v_writelane_b32 v31, s8, 20 -; GCN-NEXT: v_writelane_b32 v31, s9, 21 -; GCN-NEXT: v_writelane_b32 v31, s10, 22 -; GCN-NEXT: v_writelane_b32 v31, s11, 23 -; 
GCN-NEXT: v_writelane_b32 v31, s12, 24 -; GCN-NEXT: v_writelane_b32 v31, s13, 25 -; GCN-NEXT: v_writelane_b32 v31, s14, 26 -; GCN-NEXT: v_writelane_b32 v31, s15, 27 -; GCN-NEXT: v_writelane_b32 v31, s16, 28 -; GCN-NEXT: v_writelane_b32 v31, s17, 29 -; GCN-NEXT: v_writelane_b32 v31, s18, 30 -; GCN-NEXT: v_writelane_b32 v31, s19, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 32 -; GCN-NEXT: v_writelane_b32 v31, s5, 33 -; GCN-NEXT: v_writelane_b32 v31, s6, 34 -; GCN-NEXT: v_writelane_b32 v31, s7, 35 -; GCN-NEXT: v_writelane_b32 v31, s8, 36 -; GCN-NEXT: v_writelane_b32 v31, s9, 37 -; GCN-NEXT: v_writelane_b32 v31, s10, 38 -; GCN-NEXT: v_writelane_b32 v31, s11, 39 -; GCN-NEXT: v_writelane_b32 v31, s12, 40 -; GCN-NEXT: v_writelane_b32 v31, s13, 41 -; GCN-NEXT: v_writelane_b32 v31, s14, 42 -; GCN-NEXT: v_writelane_b32 v31, s15, 43 -; GCN-NEXT: v_writelane_b32 v31, s16, 44 -; GCN-NEXT: v_writelane_b32 v31, s17, 45 -; GCN-NEXT: v_writelane_b32 v31, s18, 46 -; GCN-NEXT: v_writelane_b32 v31, s19, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 
38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s12, 40 +; GCN-NEXT: v_writelane_b32 v0, s13, 41 +; GCN-NEXT: v_writelane_b32 v0, s14, 42 +; GCN-NEXT: v_writelane_b32 v0, s15, 43 +; GCN-NEXT: v_writelane_b32 v0, s16, 44 +; GCN-NEXT: v_writelane_b32 v0, s17, 45 +; GCN-NEXT: v_writelane_b32 v0, s18, 46 +; GCN-NEXT: v_writelane_b32 v0, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 48 -; GCN-NEXT: v_writelane_b32 v31, s5, 49 -; GCN-NEXT: v_writelane_b32 v31, s6, 50 -; GCN-NEXT: v_writelane_b32 v31, s7, 51 -; GCN-NEXT: v_writelane_b32 v31, s8, 52 -; GCN-NEXT: v_writelane_b32 v31, s9, 53 -; GCN-NEXT: v_writelane_b32 v31, s10, 54 -; GCN-NEXT: v_writelane_b32 v31, s11, 55 -; GCN-NEXT: v_writelane_b32 v31, s12, 56 -; GCN-NEXT: v_writelane_b32 v31, s13, 57 -; GCN-NEXT: v_writelane_b32 v31, s14, 58 -; GCN-NEXT: v_writelane_b32 v31, s15, 59 -; GCN-NEXT: v_writelane_b32 v31, s16, 60 -; GCN-NEXT: v_writelane_b32 v31, s17, 61 -; GCN-NEXT: v_writelane_b32 v31, s18, 62 -; GCN-NEXT: v_writelane_b32 v31, s19, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: 
buffer_store_dword v0, off, s[52:55], 0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s36, v31, 32 -; GCN-NEXT: v_readlane_b32 s37, v31, 33 -; GCN-NEXT: v_readlane_b32 s38, v31, 34 -; GCN-NEXT: v_readlane_b32 s39, v31, 35 -; GCN-NEXT: v_readlane_b32 s40, v31, 36 -; GCN-NEXT: v_readlane_b32 s41, v31, 37 -; GCN-NEXT: v_readlane_b32 s42, v31, 38 -; GCN-NEXT: v_readlane_b32 s43, v31, 39 -; GCN-NEXT: v_readlane_b32 s44, v31, 40 -; GCN-NEXT: v_readlane_b32 s45, v31, 41 -; GCN-NEXT: v_readlane_b32 s46, v31, 42 -; GCN-NEXT: v_readlane_b32 s47, v31, 43 -; GCN-NEXT: v_readlane_b32 s48, v31, 44 -; GCN-NEXT: v_readlane_b32 s49, v31, 45 -; GCN-NEXT: v_readlane_b32 s50, v31, 46 -; GCN-NEXT: v_readlane_b32 s51, v31, 47 -; GCN-NEXT: v_readlane_b32 s0, v31, 16 -; GCN-NEXT: v_readlane_b32 s1, v31, 17 -; GCN-NEXT: v_readlane_b32 s2, v31, 18 -; GCN-NEXT: v_readlane_b32 s3, v31, 19 -; GCN-NEXT: v_readlane_b32 s4, v31, 20 -; GCN-NEXT: v_readlane_b32 s5, v31, 21 -; GCN-NEXT: v_readlane_b32 s6, v31, 22 -; GCN-NEXT: v_readlane_b32 s7, v31, 23 -; GCN-NEXT: v_readlane_b32 s8, v31, 24 -; GCN-NEXT: v_readlane_b32 s9, v31, 25 -; GCN-NEXT: v_readlane_b32 s10, v31, 26 -; GCN-NEXT: v_readlane_b32 s11, v31, 27 -; GCN-NEXT: v_readlane_b32 s12, v31, 28 -; GCN-NEXT: v_readlane_b32 s13, v31, 29 -; GCN-NEXT: v_readlane_b32 s14, v31, 30 -; GCN-NEXT: v_readlane_b32 s15, v31, 31 -; GCN-NEXT: v_readlane_b32 s16, v31, 0 -; GCN-NEXT: v_readlane_b32 s17, v31, 1 -; GCN-NEXT: v_readlane_b32 s18, v31, 2 -; GCN-NEXT: v_readlane_b32 s19, v31, 3 -; GCN-NEXT: 
v_readlane_b32 s20, v31, 4 -; GCN-NEXT: v_readlane_b32 s21, v31, 5 -; GCN-NEXT: v_readlane_b32 s22, v31, 6 -; GCN-NEXT: v_readlane_b32 s23, v31, 7 -; GCN-NEXT: v_readlane_b32 s24, v31, 8 -; GCN-NEXT: v_readlane_b32 s25, v31, 9 -; GCN-NEXT: v_readlane_b32 s26, v31, 10 -; GCN-NEXT: v_readlane_b32 s27, v31, 11 -; GCN-NEXT: v_readlane_b32 s28, v31, 12 -; GCN-NEXT: v_readlane_b32 s29, v31, 13 -; GCN-NEXT: v_readlane_b32 s30, v31, 14 -; GCN-NEXT: v_readlane_b32 s31, v31, 15 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s36, v1, 32 +; GCN-NEXT: v_readlane_b32 s37, v1, 33 +; GCN-NEXT: v_readlane_b32 s38, v1, 34 +; GCN-NEXT: v_readlane_b32 s39, v1, 35 +; GCN-NEXT: v_readlane_b32 s40, v1, 36 +; GCN-NEXT: v_readlane_b32 s41, v1, 37 +; GCN-NEXT: v_readlane_b32 s42, v1, 38 +; GCN-NEXT: v_readlane_b32 s43, v1, 39 +; GCN-NEXT: v_readlane_b32 s44, v1, 40 +; GCN-NEXT: v_readlane_b32 s45, v1, 41 +; GCN-NEXT: v_readlane_b32 s46, v1, 42 +; GCN-NEXT: v_readlane_b32 s47, v1, 43 +; GCN-NEXT: v_readlane_b32 s48, v1, 44 +; GCN-NEXT: v_readlane_b32 s49, v1, 45 +; GCN-NEXT: v_readlane_b32 s50, v1, 46 +; GCN-NEXT: v_readlane_b32 s51, v1, 47 +; GCN-NEXT: v_readlane_b32 s0, v1, 16 +; GCN-NEXT: v_readlane_b32 s1, v1, 17 +; GCN-NEXT: v_readlane_b32 s2, v1, 18 +; GCN-NEXT: v_readlane_b32 s3, v1, 19 +; GCN-NEXT: v_readlane_b32 s4, v1, 20 +; GCN-NEXT: v_readlane_b32 s5, v1, 21 +; GCN-NEXT: v_readlane_b32 s6, v1, 22 +; GCN-NEXT: v_readlane_b32 s7, v1, 23 +; GCN-NEXT: v_readlane_b32 s8, v1, 24 +; GCN-NEXT: v_readlane_b32 s9, v1, 25 +; GCN-NEXT: v_readlane_b32 s10, v1, 26 +; GCN-NEXT: v_readlane_b32 s11, v1, 27 +; GCN-NEXT: v_readlane_b32 s12, v1, 28 +; GCN-NEXT: v_readlane_b32 s13, v1, 29 +; GCN-NEXT: v_readlane_b32 s14, v1, 30 +; GCN-NEXT: v_readlane_b32 s15, v1, 31 +; GCN-NEXT: v_readlane_b32 s16, v1, 0 +; GCN-NEXT: 
v_readlane_b32 s17, v1, 1 +; GCN-NEXT: v_readlane_b32 s18, v1, 2 +; GCN-NEXT: v_readlane_b32 s19, v1, 3 +; GCN-NEXT: v_readlane_b32 s20, v1, 4 +; GCN-NEXT: v_readlane_b32 s21, v1, 5 +; GCN-NEXT: v_readlane_b32 s22, v1, 6 +; GCN-NEXT: v_readlane_b32 s23, v1, 7 +; GCN-NEXT: v_readlane_b32 s24, v1, 8 +; GCN-NEXT: v_readlane_b32 s25, v1, 9 +; GCN-NEXT: v_readlane_b32 s26, v1, 10 +; GCN-NEXT: v_readlane_b32 s27, v1, 11 +; GCN-NEXT: v_readlane_b32 s28, v1, 12 +; GCN-NEXT: v_readlane_b32 s29, v1, 13 +; GCN-NEXT: v_readlane_b32 s30, v1, 14 +; GCN-NEXT: v_readlane_b32 s31, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[16:31] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v31, 48 -; GCN-NEXT: v_readlane_b32 s5, v31, 49 -; GCN-NEXT: v_readlane_b32 s6, v31, 50 -; GCN-NEXT: v_readlane_b32 s7, v31, 51 -; GCN-NEXT: v_readlane_b32 s8, v31, 52 -; GCN-NEXT: v_readlane_b32 s9, v31, 53 -; GCN-NEXT: v_readlane_b32 s10, v31, 54 -; GCN-NEXT: v_readlane_b32 s11, v31, 55 -; GCN-NEXT: v_readlane_b32 s12, v31, 56 -; GCN-NEXT: v_readlane_b32 s13, v31, 57 -; GCN-NEXT: v_readlane_b32 s14, v31, 58 -; GCN-NEXT: v_readlane_b32 s15, v31, 59 -; GCN-NEXT: v_readlane_b32 s16, v31, 60 -; GCN-NEXT: v_readlane_b32 s17, v31, 61 -; GCN-NEXT: v_readlane_b32 s18, v31, 62 -; GCN-NEXT: v_readlane_b32 s19, v31, 63 -; GCN-NEXT: s_mov_b64 s[2:3], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v1, 48 +; GCN-NEXT: v_readlane_b32 s5, v1, 49 +; GCN-NEXT: v_readlane_b32 s6, v1, 50 +; GCN-NEXT: v_readlane_b32 s7, v1, 51 +; GCN-NEXT: v_readlane_b32 s8, v1, 52 +; GCN-NEXT: v_readlane_b32 s9, v1, 53 +; GCN-NEXT: v_readlane_b32 s10, v1, 54 +; GCN-NEXT: v_readlane_b32 s11, v1, 55 +; GCN-NEXT: v_readlane_b32 s12, v1, 56 +; GCN-NEXT: v_readlane_b32 s13, v1, 
57 +; GCN-NEXT: v_readlane_b32 s14, v1, 58 +; GCN-NEXT: v_readlane_b32 s15, v1, 59 +; GCN-NEXT: v_readlane_b32 s16, v1, 60 +; GCN-NEXT: v_readlane_b32 s17, v1, 61 +; GCN-NEXT: v_readlane_b32 s18, v1, 62 +; GCN-NEXT: v_readlane_b32 s19, v1, 63 ; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -920,144 +941,144 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 0 -; GCN-NEXT: v_writelane_b32 v31, s5, 1 -; GCN-NEXT: v_writelane_b32 v31, s6, 2 -; GCN-NEXT: v_writelane_b32 v31, s7, 3 -; GCN-NEXT: v_writelane_b32 v31, s8, 4 -; GCN-NEXT: v_writelane_b32 v31, s9, 5 -; GCN-NEXT: v_writelane_b32 v31, s10, 6 -; GCN-NEXT: v_writelane_b32 v31, s11, 7 -; GCN-NEXT: v_writelane_b32 v31, s12, 8 -; GCN-NEXT: v_writelane_b32 v31, s13, 9 -; GCN-NEXT: v_writelane_b32 v31, s14, 10 -; GCN-NEXT: v_writelane_b32 v31, s15, 11 -; GCN-NEXT: v_writelane_b32 v31, s16, 12 -; GCN-NEXT: v_writelane_b32 v31, s17, 13 -; GCN-NEXT: v_writelane_b32 v31, s18, 14 -; GCN-NEXT: v_writelane_b32 v31, s19, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: 
;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 16 -; GCN-NEXT: v_writelane_b32 v31, s5, 17 -; GCN-NEXT: v_writelane_b32 v31, s6, 18 -; GCN-NEXT: v_writelane_b32 v31, s7, 19 -; GCN-NEXT: v_writelane_b32 v31, s8, 20 -; GCN-NEXT: v_writelane_b32 v31, s9, 21 -; GCN-NEXT: v_writelane_b32 v31, s10, 22 -; GCN-NEXT: v_writelane_b32 v31, s11, 23 -; GCN-NEXT: v_writelane_b32 v31, s12, 24 -; GCN-NEXT: v_writelane_b32 v31, s13, 25 -; GCN-NEXT: v_writelane_b32 v31, s14, 26 -; GCN-NEXT: v_writelane_b32 v31, s15, 27 -; GCN-NEXT: v_writelane_b32 v31, s16, 28 -; GCN-NEXT: v_writelane_b32 v31, s17, 29 -; GCN-NEXT: v_writelane_b32 v31, s18, 30 -; GCN-NEXT: v_writelane_b32 v31, s19, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 32 -; GCN-NEXT: v_writelane_b32 v31, s5, 33 -; GCN-NEXT: v_writelane_b32 v31, s6, 34 -; GCN-NEXT: v_writelane_b32 v31, s7, 35 -; GCN-NEXT: v_writelane_b32 v31, s8, 36 -; GCN-NEXT: v_writelane_b32 v31, s9, 37 -; GCN-NEXT: v_writelane_b32 v31, s10, 38 -; GCN-NEXT: v_writelane_b32 v31, s11, 39 -; GCN-NEXT: v_writelane_b32 v31, s12, 40 -; GCN-NEXT: v_writelane_b32 v31, s13, 41 -; GCN-NEXT: v_writelane_b32 v31, s14, 42 -; GCN-NEXT: v_writelane_b32 v31, s15, 43 -; GCN-NEXT: v_writelane_b32 v31, s16, 44 -; GCN-NEXT: v_writelane_b32 v31, s17, 45 -; GCN-NEXT: 
v_writelane_b32 v31, s18, 46 -; GCN-NEXT: v_writelane_b32 v31, s19, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s12, 40 +; GCN-NEXT: v_writelane_b32 v0, s13, 41 +; GCN-NEXT: v_writelane_b32 v0, s14, 42 +; GCN-NEXT: v_writelane_b32 v0, s15, 43 +; GCN-NEXT: v_writelane_b32 v0, s16, 44 +; GCN-NEXT: v_writelane_b32 v0, s17, 45 +; GCN-NEXT: v_writelane_b32 v0, s18, 46 +; GCN-NEXT: v_writelane_b32 v0, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 48 -; GCN-NEXT: v_writelane_b32 v31, s5, 49 -; GCN-NEXT: v_writelane_b32 v31, s6, 50 -; GCN-NEXT: v_writelane_b32 v31, s7, 51 -; GCN-NEXT: v_writelane_b32 v31, s8, 52 -; GCN-NEXT: v_writelane_b32 v31, s9, 53 -; GCN-NEXT: v_writelane_b32 v31, s10, 54 -; GCN-NEXT: v_writelane_b32 v31, s11, 55 -; GCN-NEXT: v_writelane_b32 v31, s12, 56 -; GCN-NEXT: v_writelane_b32 v31, s13, 57 -; GCN-NEXT: v_writelane_b32 v31, s14, 58 -; GCN-NEXT: v_writelane_b32 v31, s15, 59 -; GCN-NEXT: v_writelane_b32 v31, s16, 60 -; GCN-NEXT: v_writelane_b32 v31, s17, 61 -; GCN-NEXT: v_writelane_b32 v31, s18, 62 -; GCN-NEXT: v_writelane_b32 v31, s19, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: 
v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s36, v31, 32 -; GCN-NEXT: v_readlane_b32 s37, v31, 33 -; GCN-NEXT: v_readlane_b32 s38, v31, 34 -; GCN-NEXT: v_readlane_b32 s39, v31, 35 -; GCN-NEXT: v_readlane_b32 s40, v31, 36 -; GCN-NEXT: v_readlane_b32 s41, v31, 37 -; GCN-NEXT: v_readlane_b32 s42, v31, 38 -; GCN-NEXT: v_readlane_b32 s43, v31, 39 -; GCN-NEXT: v_readlane_b32 s44, v31, 40 -; GCN-NEXT: v_readlane_b32 s45, v31, 41 -; GCN-NEXT: v_readlane_b32 s46, v31, 42 -; GCN-NEXT: v_readlane_b32 s47, v31, 43 -; GCN-NEXT: v_readlane_b32 s48, v31, 44 -; GCN-NEXT: v_readlane_b32 s49, v31, 45 -; GCN-NEXT: v_readlane_b32 s50, v31, 46 -; GCN-NEXT: v_readlane_b32 s51, v31, 47 -; GCN-NEXT: v_readlane_b32 s0, v31, 16 -; GCN-NEXT: v_readlane_b32 s1, v31, 17 -; GCN-NEXT: v_readlane_b32 s2, v31, 18 -; GCN-NEXT: v_readlane_b32 s3, v31, 19 -; GCN-NEXT: v_readlane_b32 s4, v31, 20 -; GCN-NEXT: v_readlane_b32 s5, v31, 21 -; GCN-NEXT: v_readlane_b32 s6, v31, 22 -; GCN-NEXT: v_readlane_b32 s7, v31, 23 -; GCN-NEXT: v_readlane_b32 s8, v31, 24 -; GCN-NEXT: v_readlane_b32 s9, v31, 25 -; GCN-NEXT: v_readlane_b32 s10, v31, 26 -; GCN-NEXT: v_readlane_b32 s11, v31, 27 -; 
GCN-NEXT: v_readlane_b32 s12, v31, 28 -; GCN-NEXT: v_readlane_b32 s13, v31, 29 -; GCN-NEXT: v_readlane_b32 s14, v31, 30 -; GCN-NEXT: v_readlane_b32 s15, v31, 31 -; GCN-NEXT: v_readlane_b32 s16, v31, 0 -; GCN-NEXT: v_readlane_b32 s17, v31, 1 -; GCN-NEXT: v_readlane_b32 s18, v31, 2 -; GCN-NEXT: v_readlane_b32 s19, v31, 3 -; GCN-NEXT: v_readlane_b32 s20, v31, 4 -; GCN-NEXT: v_readlane_b32 s21, v31, 5 -; GCN-NEXT: v_readlane_b32 s22, v31, 6 -; GCN-NEXT: v_readlane_b32 s23, v31, 7 -; GCN-NEXT: v_readlane_b32 s24, v31, 8 -; GCN-NEXT: v_readlane_b32 s25, v31, 9 -; GCN-NEXT: v_readlane_b32 s26, v31, 10 -; GCN-NEXT: v_readlane_b32 s27, v31, 11 -; GCN-NEXT: v_readlane_b32 s28, v31, 12 -; GCN-NEXT: v_readlane_b32 s29, v31, 13 -; GCN-NEXT: v_readlane_b32 s30, v31, 14 -; GCN-NEXT: v_readlane_b32 s31, v31, 15 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s36, v2, 32 +; GCN-NEXT: v_readlane_b32 s37, v2, 33 +; GCN-NEXT: v_readlane_b32 s38, v2, 34 +; GCN-NEXT: v_readlane_b32 s39, v2, 35 +; GCN-NEXT: v_readlane_b32 s40, v2, 36 +; GCN-NEXT: v_readlane_b32 s41, v2, 37 +; GCN-NEXT: v_readlane_b32 s42, v2, 38 +; GCN-NEXT: v_readlane_b32 s43, v2, 39 +; GCN-NEXT: v_readlane_b32 s44, v2, 40 +; GCN-NEXT: v_readlane_b32 s45, v2, 41 +; GCN-NEXT: v_readlane_b32 s46, v2, 42 +; GCN-NEXT: v_readlane_b32 s47, v2, 43 +; GCN-NEXT: v_readlane_b32 s48, v2, 44 +; GCN-NEXT: v_readlane_b32 s49, v2, 45 +; GCN-NEXT: v_readlane_b32 s50, v2, 46 +; GCN-NEXT: v_readlane_b32 s51, v2, 47 +; GCN-NEXT: v_readlane_b32 s0, v2, 16 +; GCN-NEXT: v_readlane_b32 s1, v2, 17 +; GCN-NEXT: v_readlane_b32 s2, v2, 18 +; GCN-NEXT: v_readlane_b32 s3, v2, 19 +; GCN-NEXT: v_readlane_b32 s4, v2, 20 +; GCN-NEXT: v_readlane_b32 s5, v2, 21 +; GCN-NEXT: v_readlane_b32 s6, v2, 22 +; GCN-NEXT: v_readlane_b32 s7, v2, 23 +; GCN-NEXT: v_readlane_b32 s8, v2, 24 
+; GCN-NEXT: v_readlane_b32 s9, v2, 25 +; GCN-NEXT: v_readlane_b32 s10, v2, 26 +; GCN-NEXT: v_readlane_b32 s11, v2, 27 +; GCN-NEXT: v_readlane_b32 s12, v2, 28 +; GCN-NEXT: v_readlane_b32 s13, v2, 29 +; GCN-NEXT: v_readlane_b32 s14, v2, 30 +; GCN-NEXT: v_readlane_b32 s15, v2, 31 +; GCN-NEXT: v_readlane_b32 s16, v2, 0 +; GCN-NEXT: v_readlane_b32 s17, v2, 1 +; GCN-NEXT: v_readlane_b32 s18, v2, 2 +; GCN-NEXT: v_readlane_b32 s19, v2, 3 +; GCN-NEXT: v_readlane_b32 s20, v2, 4 +; GCN-NEXT: v_readlane_b32 s21, v2, 5 +; GCN-NEXT: v_readlane_b32 s22, v2, 6 +; GCN-NEXT: v_readlane_b32 s23, v2, 7 +; GCN-NEXT: v_readlane_b32 s24, v2, 8 +; GCN-NEXT: v_readlane_b32 s25, v2, 9 +; GCN-NEXT: v_readlane_b32 s26, v2, 10 +; GCN-NEXT: v_readlane_b32 s27, v2, 11 +; GCN-NEXT: v_readlane_b32 s28, v2, 12 +; GCN-NEXT: v_readlane_b32 s29, v2, 13 +; GCN-NEXT: v_readlane_b32 s30, v2, 14 +; GCN-NEXT: v_readlane_b32 s31, v2, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v0 ; GCN-NEXT: ;;#ASMEND @@ -1067,32 +1088,24 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v31, 48 -; GCN-NEXT: v_readlane_b32 s5, v31, 49 -; GCN-NEXT: v_readlane_b32 s6, v31, 50 -; GCN-NEXT: v_readlane_b32 s7, v31, 51 -; GCN-NEXT: v_readlane_b32 s8, v31, 52 -; GCN-NEXT: v_readlane_b32 s9, v31, 53 -; GCN-NEXT: v_readlane_b32 s10, v31, 54 -; GCN-NEXT: v_readlane_b32 s11, v31, 55 -; GCN-NEXT: v_readlane_b32 s12, v31, 56 -; GCN-NEXT: v_readlane_b32 s13, v31, 57 -; GCN-NEXT: v_readlane_b32 s14, v31, 58 -; GCN-NEXT: v_readlane_b32 s15, v31, 59 -; GCN-NEXT: v_readlane_b32 s16, v31, 60 -; GCN-NEXT: v_readlane_b32 s17, v31, 61 -; GCN-NEXT: v_readlane_b32 s18, v31, 62 -; GCN-NEXT: v_readlane_b32 s19, v31, 63 -; GCN-NEXT: s_mov_b64 s[2:3], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v2, 48 +; 
GCN-NEXT: v_readlane_b32 s5, v2, 49 +; GCN-NEXT: v_readlane_b32 s6, v2, 50 +; GCN-NEXT: v_readlane_b32 s7, v2, 51 +; GCN-NEXT: v_readlane_b32 s8, v2, 52 +; GCN-NEXT: v_readlane_b32 s9, v2, 53 +; GCN-NEXT: v_readlane_b32 s10, v2, 54 +; GCN-NEXT: v_readlane_b32 s11, v2, 55 +; GCN-NEXT: v_readlane_b32 s12, v2, 56 +; GCN-NEXT: v_readlane_b32 s13, v2, 57 +; GCN-NEXT: v_readlane_b32 s14, v2, 58 +; GCN-NEXT: v_readlane_b32 s15, v2, 59 +; GCN-NEXT: v_readlane_b32 s16, v2, 60 +; GCN-NEXT: v_readlane_b32 s17, v2, 61 +; GCN-NEXT: v_readlane_b32 s18, v2, 62 +; GCN-NEXT: v_readlane_b32 s19, v2, 63 ; GCN-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -1,22 +1,377 @@ -; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -; This ends up needing to spill SGPRs to memory, and also does not -; have any free SGPRs available to save the exec mask when doing so. -; The register scavenger also needs to use the emergency stack slot, -; which tries to place the scavenged register restore instruction as -; far the block as possible, near the terminator. This places a -; restore instruction between the condition and the conditional -; branch, which gets expanded into a sequence involving s_not_b64 on -; the exec mask, clobbering SCC value before the branch. 
We probably -; have to stop relying on being able to flip and restore the exec -; mask, and always require a free SGPR for saving exec. +; This was a negative test to catch an extreme case when all options are exhausted +; while trying to spill SGPRs to memory. After we enabled SGPR spills into virtual VGPRs +; the edge case won't arise and the test would always compile. -; CHECK: *** Bad machine code: Using an undefined physical register *** -; CHECK-NEXT: - function: kernel0 -; CHECK-NEXT: - basic block: %bb.0 -; CHECK-NEXT: - instruction: S_CBRANCH_SCC1 %bb.2, implicit killed $scc -; CHECK-NEXT: - operand 1: implicit killed $scc define amdgpu_kernel void @kernel0(i32 addrspace(1)* %out, i32 %in) #1 { +; CHECK-LABEL: kernel0: +; CHECK: ; %bb.0: +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: v_writelane_b32 v0, s2, 0 +; CHECK-NEXT: v_writelane_b32 v0, s3, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 2 +; CHECK-NEXT: v_writelane_b32 v0, s5, 3 +; CHECK-NEXT: v_writelane_b32 v0, s6, 4 +; CHECK-NEXT: v_writelane_b32 v0, s7, 5 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 6 +; CHECK-NEXT: v_writelane_b32 v0, s5, 7 +; CHECK-NEXT: v_writelane_b32 v0, s6, 8 +; CHECK-NEXT: v_writelane_b32 v0, s7, 9 +; CHECK-NEXT: v_writelane_b32 v0, s8, 10 +; CHECK-NEXT: v_writelane_b32 v0, s9, 11 +; CHECK-NEXT: v_writelane_b32 v0, s10, 12 +; CHECK-NEXT: v_writelane_b32 v0, s11, 13 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:19] +; CHECK-NEXT: 
;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 14 +; CHECK-NEXT: v_writelane_b32 v0, s5, 15 +; CHECK-NEXT: v_writelane_b32 v0, s6, 16 +; CHECK-NEXT: v_writelane_b32 v0, s7, 17 +; CHECK-NEXT: v_writelane_b32 v0, s8, 18 +; CHECK-NEXT: v_writelane_b32 v0, s9, 19 +; CHECK-NEXT: v_writelane_b32 v0, s10, 20 +; CHECK-NEXT: v_writelane_b32 v0, s11, 21 +; CHECK-NEXT: v_writelane_b32 v0, s12, 22 +; CHECK-NEXT: v_writelane_b32 v0, s13, 23 +; CHECK-NEXT: v_writelane_b32 v0, s14, 24 +; CHECK-NEXT: v_writelane_b32 v0, s15, 25 +; CHECK-NEXT: v_writelane_b32 v0, s16, 26 +; CHECK-NEXT: v_writelane_b32 v0, s17, 27 +; CHECK-NEXT: v_writelane_b32 v0, s18, 28 +; CHECK-NEXT: v_writelane_b32 v0, s19, 29 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s2, 30 +; CHECK-NEXT: v_writelane_b32 v0, s3, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 32 +; CHECK-NEXT: v_writelane_b32 v0, s5, 33 +; CHECK-NEXT: v_writelane_b32 v0, s6, 34 +; CHECK-NEXT: v_writelane_b32 v0, s7, 35 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 36 +; CHECK-NEXT: v_writelane_b32 v0, s5, 37 +; CHECK-NEXT: v_writelane_b32 v0, s6, 38 +; CHECK-NEXT: v_writelane_b32 v0, s7, 39 +; CHECK-NEXT: v_writelane_b32 v0, s8, 40 +; CHECK-NEXT: v_writelane_b32 v0, s9, 41 +; CHECK-NEXT: v_writelane_b32 v0, s10, 42 +; CHECK-NEXT: v_writelane_b32 v0, s11, 43 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[16:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[52:53] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[48:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[36:43] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: 
;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 44 +; CHECK-NEXT: v_writelane_b32 v0, s1, 45 +; CHECK-NEXT: v_writelane_b32 v0, s2, 46 +; CHECK-NEXT: v_writelane_b32 v0, s3, 47 +; CHECK-NEXT: v_writelane_b32 v0, s4, 48 +; CHECK-NEXT: v_writelane_b32 v0, s5, 49 +; CHECK-NEXT: v_writelane_b32 v0, s6, 50 +; CHECK-NEXT: v_writelane_b32 v0, s7, 51 +; CHECK-NEXT: v_writelane_b32 v0, s8, 52 +; CHECK-NEXT: v_writelane_b32 v0, s9, 53 +; CHECK-NEXT: v_writelane_b32 v0, s10, 54 +; CHECK-NEXT: v_writelane_b32 v0, s11, 55 +; CHECK-NEXT: v_writelane_b32 v0, s12, 56 +; CHECK-NEXT: v_writelane_b32 v0, s13, 57 +; CHECK-NEXT: v_writelane_b32 v0, s14, 58 +; CHECK-NEXT: v_writelane_b32 v0, s15, 59 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[34:35] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[44:47] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: v_writelane_b32 v0, s0, 60 +; CHECK-NEXT: v_writelane_b32 v1, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s1, 61 +; CHECK-NEXT: v_writelane_b32 v1, s5, 1 +; CHECK-NEXT: v_writelane_b32 v0, s2, 62 +; CHECK-NEXT: v_writelane_b32 v1, s6, 2 +; CHECK-NEXT: v_writelane_b32 v0, s3, 63 +; CHECK-NEXT: v_writelane_b32 v1, s7, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v1, s0, 4 +; CHECK-NEXT: v_writelane_b32 v1, s1, 5 +; CHECK-NEXT: v_writelane_b32 v1, s2, 6 +; CHECK-NEXT: v_writelane_b32 v1, s3, 7 +; CHECK-NEXT: v_writelane_b32 v1, s4, 8 +; CHECK-NEXT: v_writelane_b32 v1, s5, 9 +; CHECK-NEXT: v_writelane_b32 v1, s6, 10 +; CHECK-NEXT: v_writelane_b32 v1, s7, 11 +; CHECK-NEXT: v_writelane_b32 v1, s8, 12 +; CHECK-NEXT: v_writelane_b32 v1, s9, 13 +; CHECK-NEXT: v_writelane_b32 v1, s10, 14 +; CHECK-NEXT: v_writelane_b32 v1, s11, 15 +; CHECK-NEXT: v_writelane_b32 v1, s12, 16 +; CHECK-NEXT: v_writelane_b32 v1, s13, 17 +; CHECK-NEXT: v_writelane_b32 
v1, s14, 18 +; CHECK-NEXT: v_writelane_b32 v1, s15, 19 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[54:55] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v1, s0, 20 +; CHECK-NEXT: v_writelane_b32 v1, s1, 21 +; CHECK-NEXT: v_writelane_b32 v1, s2, 22 +; CHECK-NEXT: v_writelane_b32 v1, s3, 23 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v1, s0, 24 +; CHECK-NEXT: v_writelane_b32 v1, s1, 25 +; CHECK-NEXT: v_writelane_b32 v1, s2, 26 +; CHECK-NEXT: v_writelane_b32 v1, s3, 27 +; CHECK-NEXT: v_writelane_b32 v1, s4, 28 +; CHECK-NEXT: v_writelane_b32 v1, s5, 29 +; CHECK-NEXT: v_writelane_b32 v1, s6, 30 +; CHECK-NEXT: v_writelane_b32 v1, s7, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v1, s0, 32 +; CHECK-NEXT: v_writelane_b32 v1, s1, 33 +; CHECK-NEXT: v_writelane_b32 v1, s2, 34 +; CHECK-NEXT: v_writelane_b32 v1, s3, 35 +; CHECK-NEXT: v_writelane_b32 v1, s4, 36 +; CHECK-NEXT: v_writelane_b32 v1, s5, 37 +; CHECK-NEXT: v_writelane_b32 v1, s6, 38 +; CHECK-NEXT: v_writelane_b32 v1, s7, 39 +; CHECK-NEXT: v_writelane_b32 v1, s8, 40 +; CHECK-NEXT: v_writelane_b32 v1, s9, 41 +; CHECK-NEXT: v_writelane_b32 v1, s10, 42 +; CHECK-NEXT: v_writelane_b32 v1, s11, 43 +; CHECK-NEXT: v_writelane_b32 v1, s12, 44 +; CHECK-NEXT: v_writelane_b32 v1, s13, 45 +; CHECK-NEXT: v_writelane_b32 v1, s14, 46 +; CHECK-NEXT: v_writelane_b32 v1, s15, 47 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %ret +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB0_2: ; %bb0 +; CHECK-NEXT: v_readlane_b32 s0, v0, 0 +; CHECK-NEXT: v_readlane_b32 s1, v0, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 2 +; CHECK-NEXT: v_readlane_b32 s1, v0, 3 +; CHECK-NEXT: v_readlane_b32 s2, v0, 4 +; CHECK-NEXT: v_readlane_b32 s3, 
v0, 5 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 6 +; CHECK-NEXT: v_readlane_b32 s1, v0, 7 +; CHECK-NEXT: v_readlane_b32 s2, v0, 8 +; CHECK-NEXT: v_readlane_b32 s3, v0, 9 +; CHECK-NEXT: v_readlane_b32 s4, v0, 10 +; CHECK-NEXT: v_readlane_b32 s5, v0, 11 +; CHECK-NEXT: v_readlane_b32 s6, v0, 12 +; CHECK-NEXT: v_readlane_b32 s7, v0, 13 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 14 +; CHECK-NEXT: v_readlane_b32 s1, v0, 15 +; CHECK-NEXT: v_readlane_b32 s2, v0, 16 +; CHECK-NEXT: v_readlane_b32 s3, v0, 17 +; CHECK-NEXT: v_readlane_b32 s4, v0, 18 +; CHECK-NEXT: v_readlane_b32 s5, v0, 19 +; CHECK-NEXT: v_readlane_b32 s6, v0, 20 +; CHECK-NEXT: v_readlane_b32 s7, v0, 21 +; CHECK-NEXT: v_readlane_b32 s8, v0, 22 +; CHECK-NEXT: v_readlane_b32 s9, v0, 23 +; CHECK-NEXT: v_readlane_b32 s10, v0, 24 +; CHECK-NEXT: v_readlane_b32 s11, v0, 25 +; CHECK-NEXT: v_readlane_b32 s12, v0, 26 +; CHECK-NEXT: v_readlane_b32 s13, v0, 27 +; CHECK-NEXT: v_readlane_b32 s14, v0, 28 +; CHECK-NEXT: v_readlane_b32 s15, v0, 29 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 30 +; CHECK-NEXT: v_readlane_b32 s1, v0, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 32 +; CHECK-NEXT: v_readlane_b32 s1, v0, 33 +; CHECK-NEXT: v_readlane_b32 s2, v0, 34 +; CHECK-NEXT: v_readlane_b32 s3, v0, 35 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 36 +; CHECK-NEXT: v_readlane_b32 s1, v0, 37 +; CHECK-NEXT: v_readlane_b32 s2, v0, 38 +; CHECK-NEXT: v_readlane_b32 s3, v0, 39 +; CHECK-NEXT: v_readlane_b32 s4, v0, 40 +; CHECK-NEXT: v_readlane_b32 s5, v0, 41 +; CHECK-NEXT: v_readlane_b32 s6, v0, 42 +; CHECK-NEXT: v_readlane_b32 s7, v0, 43 +; CHECK-NEXT: ;;#ASMSTART +; 
CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 44 +; CHECK-NEXT: v_readlane_b32 s1, v0, 45 +; CHECK-NEXT: v_readlane_b32 s2, v0, 46 +; CHECK-NEXT: v_readlane_b32 s3, v0, 47 +; CHECK-NEXT: v_readlane_b32 s4, v0, 48 +; CHECK-NEXT: v_readlane_b32 s5, v0, 49 +; CHECK-NEXT: v_readlane_b32 s6, v0, 50 +; CHECK-NEXT: v_readlane_b32 s7, v0, 51 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[16:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[52:53] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[36:43] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s8, v0, 52 +; CHECK-NEXT: v_readlane_b32 s9, v0, 53 +; CHECK-NEXT: v_readlane_b32 s10, v0, 54 +; CHECK-NEXT: v_readlane_b32 s11, v0, 55 +; CHECK-NEXT: v_readlane_b32 s12, v0, 56 +; CHECK-NEXT: v_readlane_b32 s13, v0, 57 +; CHECK-NEXT: v_readlane_b32 s14, v0, 58 +; CHECK-NEXT: v_readlane_b32 s15, v0, 59 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 60 +; CHECK-NEXT: v_readlane_b32 s1, v0, 61 +; CHECK-NEXT: v_readlane_b32 s2, v0, 62 +; CHECK-NEXT: v_readlane_b32 s3, v0, 63 +; CHECK-NEXT: v_readlane_b32 s4, v1, 0 +; CHECK-NEXT: v_readlane_b32 s5, v1, 1 +; CHECK-NEXT: v_readlane_b32 s6, v1, 2 +; CHECK-NEXT: v_readlane_b32 s7, v1, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[34:35] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[44:47] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v1, 4 +; CHECK-NEXT: v_readlane_b32 s1, v1, 5 +; CHECK-NEXT: v_readlane_b32 s2, v1, 6 +; CHECK-NEXT: v_readlane_b32 s3, v1, 7 +; CHECK-NEXT: v_readlane_b32 s4, v1, 8 +; CHECK-NEXT: v_readlane_b32 s5, v1, 9 +; CHECK-NEXT: v_readlane_b32 s6, v1, 10 +; CHECK-NEXT: v_readlane_b32 
s7, v1, 11 +; CHECK-NEXT: v_readlane_b32 s8, v1, 12 +; CHECK-NEXT: v_readlane_b32 s9, v1, 13 +; CHECK-NEXT: v_readlane_b32 s10, v1, 14 +; CHECK-NEXT: v_readlane_b32 s11, v1, 15 +; CHECK-NEXT: v_readlane_b32 s12, v1, 16 +; CHECK-NEXT: v_readlane_b32 s13, v1, 17 +; CHECK-NEXT: v_readlane_b32 s14, v1, 18 +; CHECK-NEXT: v_readlane_b32 s15, v1, 19 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v1, 20 +; CHECK-NEXT: v_readlane_b32 s1, v1, 21 +; CHECK-NEXT: v_readlane_b32 s2, v1, 22 +; CHECK-NEXT: v_readlane_b32 s3, v1, 23 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[54:55] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v1, 24 +; CHECK-NEXT: v_readlane_b32 s1, v1, 25 +; CHECK-NEXT: v_readlane_b32 s2, v1, 26 +; CHECK-NEXT: v_readlane_b32 s3, v1, 27 +; CHECK-NEXT: v_readlane_b32 s4, v1, 28 +; CHECK-NEXT: v_readlane_b32 s5, v1, 29 +; CHECK-NEXT: v_readlane_b32 s6, v1, 30 +; CHECK-NEXT: v_readlane_b32 s7, v1, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v1, 32 +; CHECK-NEXT: v_readlane_b32 s1, v1, 33 +; CHECK-NEXT: v_readlane_b32 s2, v1, 34 +; CHECK-NEXT: v_readlane_b32 s3, v1, 35 +; CHECK-NEXT: v_readlane_b32 s4, v1, 36 +; CHECK-NEXT: v_readlane_b32 s5, v1, 37 +; CHECK-NEXT: v_readlane_b32 s6, v1, 38 +; CHECK-NEXT: v_readlane_b32 s7, v1, 39 +; CHECK-NEXT: v_readlane_b32 s8, v1, 40 +; CHECK-NEXT: v_readlane_b32 s9, v1, 41 +; CHECK-NEXT: v_readlane_b32 s10, v1, 42 +; CHECK-NEXT: v_readlane_b32 s11, v1, 43 +; CHECK-NEXT: v_readlane_b32 s12, v1, 44 +; CHECK-NEXT: v_readlane_b32 s13, v1, 45 +; CHECK-NEXT: v_readlane_b32 s14, v1, 46 +; CHECK-NEXT: v_readlane_b32 s15, v1, 47 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect 
"", "~{v[8:15]}" () #0 call void asm sideeffect "", "~{v[16:19]}"() #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir @@ -1,4 +1,6 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, replace the dead frame index in the DBG_VALUE instruction with reg 0. # Otherwise, the test would crash during PEI while trying to replace the dead frame index. 
@@ -39,13 +41,21 @@ workGroupIDX: { reg: '$sgpr8' } privateSegmentWaveByteOffset: { reg: '$sgpr9' } body: | - ; CHECK-LABEL: name: test - ; CHECK: bb.0: - ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, $vgpr0 - ; CHECK: DBG_VALUE $noreg, 0 - ; CHECK: bb.1: - ; CHECK: $sgpr10 = V_READLANE_B32 $vgpr0, 0 - ; CHECK: S_ENDPGM 0 + ; SGPR_SPILL-LABEL: name: test + ; SGPR_SPILL: bb.0: + ; SGPR_SPILL: [[VGPR:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; SGPR_SPILL: [[VGPR]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[VGPR]] + ; SGPR_SPILL: DBG_VALUE $noreg, 0 + ; SGPR_SPILL: bb.1: + ; SGPR_SPILL: $sgpr10 = V_READLANE_B32 [[VGPR]], 0 + ; SGPR_SPILL: S_ENDPGM 0 + ; PEI-LABEL: name: test + ; PEI: bb.0: + ; PEI: renamable $[[VGPR:vgpr[0-9]+]] = IMPLICIT_DEF + ; PEI: renamable $[[VGPR]] = V_WRITELANE_B32 killed $sgpr10, 0, killed $[[VGPR]] + ; PEI: bb.1: + ; PEI: $sgpr10 = V_READLANE_B32 killed $[[VGPR]], 0 + ; PEI: S_ENDPGM 0 bb.0: renamable $sgpr10 = IMPLICIT_DEF SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; The first 64 SGPR spills can go to a VGPR, but there isn't a second -; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element. +; This test was originally written when SGPRs are spilled directly to physical VGPRs and +; stressed a case when there wasn't enough VGPRs to accommodate all spills. +; When we started spilling them into virtual VGPR lanes, we always succeed in doing so. 
+; The regalloc pass later takes care of allocating VGPRs to these virtual registers. define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: @@ -23,179 +25,171 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 0 -; GCN-NEXT: v_writelane_b32 v23, s9, 1 -; GCN-NEXT: v_writelane_b32 v23, s10, 2 -; GCN-NEXT: v_writelane_b32 v23, s11, 3 -; GCN-NEXT: v_writelane_b32 v23, s12, 4 -; GCN-NEXT: v_writelane_b32 v23, s13, 5 -; GCN-NEXT: v_writelane_b32 v23, s14, 6 -; GCN-NEXT: v_writelane_b32 v23, s15, 7 -; GCN-NEXT: v_writelane_b32 v23, s16, 8 -; GCN-NEXT: v_writelane_b32 v23, s17, 9 -; GCN-NEXT: v_writelane_b32 v23, s18, 10 -; GCN-NEXT: v_writelane_b32 v23, s19, 11 -; GCN-NEXT: v_writelane_b32 v23, s20, 12 -; GCN-NEXT: v_writelane_b32 v23, s21, 13 -; GCN-NEXT: v_writelane_b32 v23, s22, 14 -; GCN-NEXT: v_writelane_b32 v23, s23, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s8, 0 +; GCN-NEXT: v_writelane_b32 v0, s9, 1 +; GCN-NEXT: v_writelane_b32 v0, s10, 2 +; GCN-NEXT: v_writelane_b32 v0, s11, 3 +; GCN-NEXT: v_writelane_b32 v0, s12, 4 +; GCN-NEXT: v_writelane_b32 v0, s13, 5 +; GCN-NEXT: v_writelane_b32 v0, s14, 6 +; GCN-NEXT: v_writelane_b32 v0, s15, 7 +; GCN-NEXT: v_writelane_b32 v0, s16, 8 +; GCN-NEXT: v_writelane_b32 v0, s17, 9 +; GCN-NEXT: v_writelane_b32 v0, s18, 10 +; GCN-NEXT: v_writelane_b32 v0, s19, 11 +; GCN-NEXT: v_writelane_b32 v0, s20, 12 +; GCN-NEXT: v_writelane_b32 v0, s21, 13 +; GCN-NEXT: v_writelane_b32 v0, s22, 14 +; GCN-NEXT: v_writelane_b32 v0, s23, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 16 -; GCN-NEXT: v_writelane_b32 v23, s9, 17 -; GCN-NEXT: v_writelane_b32 v23, s10, 18 -; GCN-NEXT: v_writelane_b32 v23, s11, 19 -; GCN-NEXT: v_writelane_b32 v23, s12, 20 -; GCN-NEXT: v_writelane_b32 v23, s13, 21 -; GCN-NEXT: 
v_writelane_b32 v23, s14, 22 -; GCN-NEXT: v_writelane_b32 v23, s15, 23 -; GCN-NEXT: v_writelane_b32 v23, s16, 24 -; GCN-NEXT: v_writelane_b32 v23, s17, 25 -; GCN-NEXT: v_writelane_b32 v23, s18, 26 -; GCN-NEXT: v_writelane_b32 v23, s19, 27 -; GCN-NEXT: v_writelane_b32 v23, s20, 28 -; GCN-NEXT: v_writelane_b32 v23, s21, 29 -; GCN-NEXT: v_writelane_b32 v23, s22, 30 -; GCN-NEXT: v_writelane_b32 v23, s23, 31 +; GCN-NEXT: v_writelane_b32 v0, s8, 16 +; GCN-NEXT: v_writelane_b32 v0, s9, 17 +; GCN-NEXT: v_writelane_b32 v0, s10, 18 +; GCN-NEXT: v_writelane_b32 v0, s11, 19 +; GCN-NEXT: v_writelane_b32 v0, s12, 20 +; GCN-NEXT: v_writelane_b32 v0, s13, 21 +; GCN-NEXT: v_writelane_b32 v0, s14, 22 +; GCN-NEXT: v_writelane_b32 v0, s15, 23 +; GCN-NEXT: v_writelane_b32 v0, s16, 24 +; GCN-NEXT: v_writelane_b32 v0, s17, 25 +; GCN-NEXT: v_writelane_b32 v0, s18, 26 +; GCN-NEXT: v_writelane_b32 v0, s19, 27 +; GCN-NEXT: v_writelane_b32 v0, s20, 28 +; GCN-NEXT: v_writelane_b32 v0, s21, 29 +; GCN-NEXT: v_writelane_b32 v0, s22, 30 +; GCN-NEXT: v_writelane_b32 v0, s23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 32 -; GCN-NEXT: v_writelane_b32 v23, s9, 33 -; GCN-NEXT: v_writelane_b32 v23, s10, 34 -; GCN-NEXT: v_writelane_b32 v23, s11, 35 -; GCN-NEXT: v_writelane_b32 v23, s12, 36 -; GCN-NEXT: v_writelane_b32 v23, s13, 37 -; GCN-NEXT: v_writelane_b32 v23, s14, 38 -; GCN-NEXT: v_writelane_b32 v23, s15, 39 -; GCN-NEXT: v_writelane_b32 v23, s16, 40 -; GCN-NEXT: v_writelane_b32 v23, s17, 41 -; GCN-NEXT: v_writelane_b32 v23, s18, 42 -; GCN-NEXT: v_writelane_b32 v23, s19, 43 -; GCN-NEXT: v_writelane_b32 v23, s20, 44 -; GCN-NEXT: v_writelane_b32 v23, s21, 45 -; GCN-NEXT: v_writelane_b32 v23, s22, 46 -; GCN-NEXT: v_writelane_b32 v23, s23, 47 +; GCN-NEXT: v_writelane_b32 v0, s8, 32 +; GCN-NEXT: v_writelane_b32 v0, s9, 33 +; GCN-NEXT: v_writelane_b32 v0, s10, 34 +; GCN-NEXT: v_writelane_b32 v0, s11, 35 +; GCN-NEXT: v_writelane_b32 v0, 
s12, 36 +; GCN-NEXT: v_writelane_b32 v0, s13, 37 +; GCN-NEXT: v_writelane_b32 v0, s14, 38 +; GCN-NEXT: v_writelane_b32 v0, s15, 39 +; GCN-NEXT: v_writelane_b32 v0, s16, 40 +; GCN-NEXT: v_writelane_b32 v0, s17, 41 +; GCN-NEXT: v_writelane_b32 v0, s18, 42 +; GCN-NEXT: v_writelane_b32 v0, s19, 43 +; GCN-NEXT: v_writelane_b32 v0, s20, 44 +; GCN-NEXT: v_writelane_b32 v0, s21, 45 +; GCN-NEXT: v_writelane_b32 v0, s22, 46 +; GCN-NEXT: v_writelane_b32 v0, s23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 48 -; GCN-NEXT: v_writelane_b32 v23, s9, 49 -; GCN-NEXT: v_writelane_b32 v23, s10, 50 -; GCN-NEXT: v_writelane_b32 v23, s11, 51 -; GCN-NEXT: v_writelane_b32 v23, s12, 52 -; GCN-NEXT: v_writelane_b32 v23, s13, 53 -; GCN-NEXT: v_writelane_b32 v23, s14, 54 -; GCN-NEXT: v_writelane_b32 v23, s15, 55 -; GCN-NEXT: v_writelane_b32 v23, s16, 56 -; GCN-NEXT: v_writelane_b32 v23, s17, 57 -; GCN-NEXT: v_writelane_b32 v23, s18, 58 -; GCN-NEXT: v_writelane_b32 v23, s19, 59 -; GCN-NEXT: v_writelane_b32 v23, s20, 60 -; GCN-NEXT: v_writelane_b32 v23, s21, 61 -; GCN-NEXT: v_writelane_b32 v23, s22, 62 -; GCN-NEXT: v_writelane_b32 v23, s23, 63 +; GCN-NEXT: v_writelane_b32 v0, s8, 48 +; GCN-NEXT: v_writelane_b32 v0, s9, 49 +; GCN-NEXT: v_writelane_b32 v0, s10, 50 +; GCN-NEXT: v_writelane_b32 v0, s11, 51 +; GCN-NEXT: v_writelane_b32 v0, s12, 52 +; GCN-NEXT: v_writelane_b32 v0, s13, 53 +; GCN-NEXT: v_writelane_b32 v0, s14, 54 +; GCN-NEXT: v_writelane_b32 v0, s15, 55 +; GCN-NEXT: v_writelane_b32 v0, s16, 56 +; GCN-NEXT: v_writelane_b32 v0, s17, 57 +; GCN-NEXT: v_writelane_b32 v0, s18, 58 +; GCN-NEXT: v_writelane_b32 v0, s19, 59 +; GCN-NEXT: v_writelane_b32 v0, s20, 60 +; GCN-NEXT: v_writelane_b32 v0, s21, 61 +; GCN-NEXT: v_writelane_b32 v0, s22, 62 +; GCN-NEXT: v_writelane_b32 v0, s23, 63 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[6:7] ; GCN-NEXT: 
;;#ASMEND -; GCN-NEXT: s_mov_b64 s[8:9], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s6, 0 ; GCN-NEXT: v_writelane_b32 v0, s7, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, s5 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s4, v23, 0 -; GCN-NEXT: v_readlane_b32 s5, v23, 1 -; GCN-NEXT: v_readlane_b32 s6, v23, 2 -; GCN-NEXT: v_readlane_b32 s7, v23, 3 -; GCN-NEXT: v_readlane_b32 s8, v23, 4 -; GCN-NEXT: v_readlane_b32 s9, v23, 5 -; GCN-NEXT: v_readlane_b32 s10, v23, 6 -; GCN-NEXT: v_readlane_b32 s11, v23, 7 -; GCN-NEXT: v_readlane_b32 s12, v23, 8 -; GCN-NEXT: v_readlane_b32 s13, v23, 9 -; GCN-NEXT: v_readlane_b32 s14, v23, 10 -; GCN-NEXT: v_readlane_b32 s15, v23, 11 -; GCN-NEXT: v_readlane_b32 s16, v23, 12 -; GCN-NEXT: v_readlane_b32 s17, v23, 13 -; GCN-NEXT: v_readlane_b32 s18, v23, 14 -; GCN-NEXT: v_readlane_b32 s19, v23, 15 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-NEXT: v_readlane_b32 s5, v1, 1 +; GCN-NEXT: v_readlane_b32 s6, v1, 2 +; GCN-NEXT: v_readlane_b32 s7, v1, 3 +; GCN-NEXT: v_readlane_b32 s8, v1, 4 +; GCN-NEXT: v_readlane_b32 s9, v1, 5 +; GCN-NEXT: v_readlane_b32 s10, v1, 6 +; GCN-NEXT: v_readlane_b32 s11, v1, 7 +; GCN-NEXT: v_readlane_b32 s12, v1, 8 +; GCN-NEXT: v_readlane_b32 s13, v1, 9 +; GCN-NEXT: v_readlane_b32 s14, v1, 10 +; GCN-NEXT: v_readlane_b32 s15, v1, 11 +; GCN-NEXT: v_readlane_b32 s16, v1, 12 +; GCN-NEXT: v_readlane_b32 s17, v1, 13 +; GCN-NEXT: 
v_readlane_b32 s18, v1, 14 +; GCN-NEXT: v_readlane_b32 s19, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v23, 16 -; GCN-NEXT: v_readlane_b32 s5, v23, 17 -; GCN-NEXT: v_readlane_b32 s6, v23, 18 -; GCN-NEXT: v_readlane_b32 s7, v23, 19 -; GCN-NEXT: v_readlane_b32 s8, v23, 20 -; GCN-NEXT: v_readlane_b32 s9, v23, 21 -; GCN-NEXT: v_readlane_b32 s10, v23, 22 -; GCN-NEXT: v_readlane_b32 s11, v23, 23 -; GCN-NEXT: v_readlane_b32 s12, v23, 24 -; GCN-NEXT: v_readlane_b32 s13, v23, 25 -; GCN-NEXT: v_readlane_b32 s14, v23, 26 -; GCN-NEXT: v_readlane_b32 s15, v23, 27 -; GCN-NEXT: v_readlane_b32 s16, v23, 28 -; GCN-NEXT: v_readlane_b32 s17, v23, 29 -; GCN-NEXT: v_readlane_b32 s18, v23, 30 -; GCN-NEXT: v_readlane_b32 s19, v23, 31 +; GCN-NEXT: v_readlane_b32 s4, v1, 16 +; GCN-NEXT: v_readlane_b32 s5, v1, 17 +; GCN-NEXT: v_readlane_b32 s6, v1, 18 +; GCN-NEXT: v_readlane_b32 s7, v1, 19 +; GCN-NEXT: v_readlane_b32 s8, v1, 20 +; GCN-NEXT: v_readlane_b32 s9, v1, 21 +; GCN-NEXT: v_readlane_b32 s10, v1, 22 +; GCN-NEXT: v_readlane_b32 s11, v1, 23 +; GCN-NEXT: v_readlane_b32 s12, v1, 24 +; GCN-NEXT: v_readlane_b32 s13, v1, 25 +; GCN-NEXT: v_readlane_b32 s14, v1, 26 +; GCN-NEXT: v_readlane_b32 s15, v1, 27 +; GCN-NEXT: v_readlane_b32 s16, v1, 28 +; GCN-NEXT: v_readlane_b32 s17, v1, 29 +; GCN-NEXT: v_readlane_b32 s18, v1, 30 +; GCN-NEXT: v_readlane_b32 s19, v1, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v23, 32 -; GCN-NEXT: v_readlane_b32 s5, v23, 33 -; GCN-NEXT: v_readlane_b32 s6, v23, 34 -; GCN-NEXT: v_readlane_b32 s7, v23, 35 -; GCN-NEXT: v_readlane_b32 s8, v23, 36 -; GCN-NEXT: v_readlane_b32 s9, v23, 37 -; GCN-NEXT: v_readlane_b32 s10, v23, 38 -; GCN-NEXT: v_readlane_b32 s11, v23, 39 -; GCN-NEXT: v_readlane_b32 s12, v23, 40 -; GCN-NEXT: v_readlane_b32 s13, v23, 41 -; GCN-NEXT: v_readlane_b32 s14, v23, 42 -; GCN-NEXT: v_readlane_b32 s15, v23, 43 -; GCN-NEXT: 
v_readlane_b32 s16, v23, 44 -; GCN-NEXT: v_readlane_b32 s17, v23, 45 -; GCN-NEXT: v_readlane_b32 s18, v23, 46 -; GCN-NEXT: v_readlane_b32 s19, v23, 47 +; GCN-NEXT: v_readlane_b32 s4, v1, 32 +; GCN-NEXT: v_readlane_b32 s5, v1, 33 +; GCN-NEXT: v_readlane_b32 s6, v1, 34 +; GCN-NEXT: v_readlane_b32 s7, v1, 35 +; GCN-NEXT: v_readlane_b32 s8, v1, 36 +; GCN-NEXT: v_readlane_b32 s9, v1, 37 +; GCN-NEXT: v_readlane_b32 s10, v1, 38 +; GCN-NEXT: v_readlane_b32 s11, v1, 39 +; GCN-NEXT: v_readlane_b32 s12, v1, 40 +; GCN-NEXT: v_readlane_b32 s13, v1, 41 +; GCN-NEXT: v_readlane_b32 s14, v1, 42 +; GCN-NEXT: v_readlane_b32 s15, v1, 43 +; GCN-NEXT: v_readlane_b32 s16, v1, 44 +; GCN-NEXT: v_readlane_b32 s17, v1, 45 +; GCN-NEXT: v_readlane_b32 s18, v1, 46 +; GCN-NEXT: v_readlane_b32 s19, v1, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s8, v23, 48 -; GCN-NEXT: v_readlane_b32 s9, v23, 49 -; GCN-NEXT: v_readlane_b32 s10, v23, 50 -; GCN-NEXT: v_readlane_b32 s11, v23, 51 -; GCN-NEXT: v_readlane_b32 s12, v23, 52 -; GCN-NEXT: v_readlane_b32 s13, v23, 53 -; GCN-NEXT: v_readlane_b32 s14, v23, 54 -; GCN-NEXT: v_readlane_b32 s15, v23, 55 -; GCN-NEXT: v_readlane_b32 s16, v23, 56 -; GCN-NEXT: v_readlane_b32 s17, v23, 57 -; GCN-NEXT: v_readlane_b32 s18, v23, 58 -; GCN-NEXT: v_readlane_b32 s19, v23, 59 -; GCN-NEXT: v_readlane_b32 s20, v23, 60 -; GCN-NEXT: v_readlane_b32 s21, v23, 61 -; GCN-NEXT: v_readlane_b32 s22, v23, 62 -; GCN-NEXT: v_readlane_b32 s23, v23, 63 -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s8, v1, 48 +; GCN-NEXT: v_readlane_b32 s9, v1, 49 +; GCN-NEXT: v_readlane_b32 s10, v1, 50 +; GCN-NEXT: v_readlane_b32 s11, v1, 51 +; GCN-NEXT: v_readlane_b32 s12, v1, 52 +; GCN-NEXT: v_readlane_b32 s13, v1, 53 +; GCN-NEXT: 
v_readlane_b32 s14, v1, 54 +; GCN-NEXT: v_readlane_b32 s15, v1, 55 +; GCN-NEXT: v_readlane_b32 s16, v1, 56 +; GCN-NEXT: v_readlane_b32 s17, v1, 57 +; GCN-NEXT: v_readlane_b32 s18, v1, 58 +; GCN-NEXT: v_readlane_b32 s19, v1, 59 +; GCN-NEXT: v_readlane_b32 s20, v1, 60 +; GCN-NEXT: v_readlane_b32 s21, v1, 61 +; GCN-NEXT: v_readlane_b32 s22, v1, 62 +; GCN-NEXT: v_readlane_b32 s23, v1, 63 ; GCN-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-NEXT: v_readlane_b32 s5, v0, 1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[8:23] ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir @@ -20,10 +20,11 @@ liveins: $sgpr4 ; CHECK-LABEL: name: sgpr_spill_s64_undef_high32 - ; CHECK: liveins: $sgpr4, $vgpr0 + ; CHECK: liveins: $sgpr4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... 
@@ -45,10 +46,11 @@ liveins: $sgpr5 ; CHECK-LABEL: name: sgpr_spill_s64_undef_low32 - ; CHECK: liveins: $sgpr5, $vgpr0 + ; CHECK: liveins: $sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -start-before=si-lower-sgpr-spills -stop-after=prologepilog -o - %s | FileCheck %s # Check that we allocate 2 emergency stack slots if we're spilling # SGPRs to memory and potentially have an offset larger than fits in @@ -21,7 +21,7 @@ # CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) -# CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 killed $sgpr10, 0, undef $vgpr1 +# CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr10, 0, undef $vgpr1 # CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET 
killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) # CHECK-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -5,16 +5,23 @@ ret void } +; SGPRs are now spilled into virtual VGPRs and regalloc takes care of finding +; physical VGPRs and this test shouldn't take the high register for spill lanes. ; GCN-LABEL: {{^}}spill_sgpr_with_no_lower_vgpr_available: -; GCN: buffer_store_dword v255, off, s[0:3], s32 +; GCN: buffer_store_dword [[LANE_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 ; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 -; GCN: v_writelane_b32 v255, s30, 0 -; GCN: v_writelane_b32 v255, s31, 1 +; GCN: v_writelane_b32 [[LANE_VGPR]], s30, 0 +; GCN: v_writelane_b32 [[LANE_VGPR]], s31, 1 +; GCN-NOT: v_writelane_b32 v255, s30, 0 +; GCN-NOT: v_writelane_b32 v255, s31, 1 ; GCN: s_swappc_b64 s[30:31], s[4:5] -; GCN: v_readlane_b32 s31, v255, 1 -; GCN: v_readlane_b32 s30, v255, 0 +; GCN: v_readlane_b32 s31, [[LANE_VGPR]], 1 +; GCN: v_readlane_b32 s30, [[LANE_VGPR]], 0 +; GCN-NOT: v_readlane_b32 s31, v255, 1 +; GCN-NOT: v_readlane_b32 s30, v255, 0 ; GCN: s_mov_b32 s33, [[TMP_SGPR]] -; GCN: ; NumVgprs: 256 +; GCN: ; NumVgprs: 255 define void @spill_sgpr_with_no_lower_vgpr_available() #0 { %alloca = alloca i32, align 4, addrspace(5) @@ -52,13 +59,18 @@ } ; GCN-LABEL: {{^}}spill_to_lowest_available_vgpr: -; GCN: buffer_store_dword v254, off, s[0:3], s32 +; GCN: buffer_store_dword [[LANE_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} 
offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; GCN-NOT: buffer_store_dword v254, off, s[0:3], s32 ; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 -; GCN: v_writelane_b32 v254, s30, 0 -; GCN: v_writelane_b32 v254, s31, 1 +; GCN: v_writelane_b32 [[LANE_VGPR]], s30, 0 +; GCN: v_writelane_b32 [[LANE_VGPR]], s31, 1 +; GCN-NOT: v_writelane_b32 v254, s30, 0 +; GCN-NOT: v_writelane_b32 v254, s31, 1 ; GCN: s_swappc_b64 s[30:31], s[4:5] -; GCN: v_readlane_b32 s31, v254, 1 -; GCN: v_readlane_b32 s30, v254, 0 +; GCN: v_readlane_b32 s31, [[LANE_VGPR]], 1 +; GCN: v_readlane_b32 s30, [[LANE_VGPR]], 0 +; GCN-NOT: v_readlane_b32 s31, v254, 1 +; GCN-NOT: v_readlane_b32 s30, v254, 0 ; GCN: s_mov_b32 s33, [[TMP_SGPR]] define void @spill_to_lowest_available_vgpr() #0 { @@ -97,10 +109,14 @@ } ; GCN-LABEL: {{^}}spill_sgpr_with_sgpr_uses: +; GCN: buffer_store_dword [[LANE_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill ; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 +; GCN-NOT: buffer_store_dword v254, off, s[0:3], s32 ; GCN: ; def s4 -; GCN: v_writelane_b32 v254, s4, 0 -; GCN: v_readlane_b32 s4, v254, 0 +; GCN: v_writelane_b32 [[LANE_VGPR]], s4, 0 +; GCN: v_readlane_b32 s4, [[LANE_VGPR]], 0 +; GCN-NOT: v_writelane_b32 v254, s4, 0 +; GCN-NOT: v_readlane_b32 s4, v254, 0 ; GCN: ; use s4 define void @spill_sgpr_with_sgpr_uses() #0 { @@ -192,22 +208,22 @@ ; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr: ; GCN: v_writelane_b32 [[A:v[0-9]+]], s34, 0 -; GCN: buffer_store_dword [[A]], off, s[0:3], s32 -; GCN: v_writelane_b32 [[B:v[0-9]+]], s35, 0 -; GCN: buffer_store_dword [[B]], off, s[0:3], s32 -; GCN: v_writelane_b32 [[C:v[0-9]+]], s36, 0 -; GCN: buffer_store_dword [[C]], off, s[0:3], s32 -; GCN: v_writelane_b32 [[D:v[0-9]+]], s37, 0 -; GCN: buffer_store_dword [[D]], off, s[0:3], s32 +; GCN: v_writelane_b32 [[A]], s35, 1 +; GCN: v_writelane_b32 [[A]], s36, 2 +; GCN: v_writelane_b32 [[A]], s37, 3 +; GCN: buffer_store_dword [[A]], off, 
s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s35, 0 +; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s36, 0 +; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s37, 0 ; GCN: #ASMEND -; GCN: buffer_load_dword [[E:v[0-9]+]] -; GCN: v_readlane_b32 s37, [[E]], 0 -; GCN: buffer_load_dword [[F:v[0-9]+]] -; GCN: v_readlane_b32 s36, [[F]], 0 -; GCN: buffer_load_dword [[G:v[0-9]+]] -; GCN: v_readlane_b32 s35, [[G]], 0 -; GCN: buffer_load_dword [[H:v[0-9]+]] -; GCN: v_readlane_b32 s34, [[H]], 0 +; GCN-NOT: v_readlane_b32 s37, v{{[0-9]+}}, 0 +; GCN-NOT: v_readlane_b32 s36, v{{[0-9]+}}, 0 +; GCN-NOT: v_readlane_b32 s35, v{{[0-9]+}}, 0 +; GCN: buffer_load_dword [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[IDX_OFF]] ; 4-byte Folded Reload +; GCN: v_readlane_b32 s37, [[B]], 3 +; GCN: v_readlane_b32 s36, [[B]], 2 +; GCN: v_readlane_b32 s35, [[B]], 1 +; GCN: v_readlane_b32 s34, [[B]], 0 define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %a = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -282,15 +298,15 @@ } ; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr_ipra: -; GCN: v_writelane_b32 v0, s30, 0 -; GCN: buffer_store_dword v0, off -; GCN: v_writelane_b32 v0, s31, 0 -; GCN: buffer_store_dword v0, off +; GCN: v_writelane_b32 [[A:v[0-9]+]], s30, 0 +; GCN: v_writelane_b32 [[A]], s31, 1 +; GCN: buffer_store_dword [[A]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s31, 0 ; GCN: swappc -; GCN: buffer_load_dword v0, off -; GCN: v_readlane_b32 s31, v0, 0 -; GCN: buffer_load_dword v0, off -; GCN: v_readlane_b32 s30, v0, 0 +; GCN-NOT: v_readlane_b32 s31, v{{[0-9]+}}, 0 +; GCN: buffer_load_dword [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[IDX_OFF]] ; 4-byte Folded Reload +; GCN: v_readlane_b32 s31, [[B]], 1 +; GCN: v_readlane_b32 s30, [[B]], 0 define void 
@spill_sgpr_no_free_vgpr_ipra() #0 { call void @child_function_ipra() ret void diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -7,15 +7,13 @@ ; Make sure we are handling hazards correctly. ; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 +; SGPR: v_mov_b32_e32 v0, s100 ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 -; SGPR-NEXT: buffer_load_dword [[VHI]], off, s[96:99], 0 -; SGPR-NEXT: s_waitcnt vmcnt(0) -; SGPR-NEXT: s_mov_b64 exec, s[4:5] -; SGPR-NEXT: s_nop 1 +; SGPR-NEXT: s_nop 4 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; ALL: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -210,15 +210,15 @@ ; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 ; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0 -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 
s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -2,19 +2,19 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v41, s33, 0 +; GCN: v_writelane_b32 v40, s33, 0 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 s33, v41, 0 +; GCN: v_readlane_b32 s33, v40, 0 ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_setpc_b64 define void @spill_csr_s5_copy() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir --- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-lower-sgpr-spills,prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# 
RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # Make sure the initial first $sgpr1 = COPY $sgpr2 copy is not deleted # by the copy propagation after lowering the spill. @@ -26,11 +26,12 @@ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr1 + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $sgpr8 = COPY renamable $sgpr1 ; GCN-NEXT: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -63,10 +64,11 @@ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, 
$sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -93,12 +95,12 @@ ; GCN-LABEL: name: spill_vgpr128_use_subreg ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: 
BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) - ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr1 + ; GCN-NEXT: renamable $vgpr8 = COPY $vgpr2, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr8 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -123,11 +125,11 @@ ; GCN-LABEL: name: spill_vgpr128_use_kill ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, 
addrspace 5) + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) ; GCN-NEXT: S_ENDPGM 0 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir @@ -10,10 +10,12 @@ bb.0: liveins: $sgpr50 ; CHECK-LABEL: name: spill_csr_sgpr_argument - ; CHECK: liveins: $sgpr50, $vgpr0 - ; CHECK: $vgpr0 = V_WRITELANE_B32 $sgpr50, 0, $vgpr0 - ; CHECK: S_NOP 0, implicit $sgpr50 - ; CHECK: $sgpr50 = S_MOV_B32 0 + ; CHECK: liveins: $sgpr50 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr50, 0, [[V_WRITELANE_B32_]] + ; CHECK-NEXT: S_NOP 0, implicit $sgpr50 + ; CHECK-NEXT: $sgpr50 = S_MOV_B32 0 S_NOP 0, implicit $sgpr50 $sgpr50 = S_MOV_B32 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; Spill an SGPR to scratch without having spare SGPRs available to save exec +; The test was originally written to spill an SGPR to scratch without having spare SGPRs available to save exec. +; This scenario no longer exists when we enabled SGPR spill into virtual VGPRs. define amdgpu_kernel void @test() #1 { ; GFX10-LABEL: test: @@ -18,31 +19,19 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s[8:12] ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_not_b64 exec, exec -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-NOT: s_not_b64 exec, exec +; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: v_writelane_b32 v0, s8, 0 ; GFX10-NEXT: v_writelane_b32 v0, s9, 1 ; GFX10-NEXT: v_writelane_b32 v0, s10, 2 ; GFX10-NEXT: v_writelane_b32 v0, s11, 3 ; GFX10-NEXT: v_writelane_b32 v0, s12, 4 ; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b64 exec, exec -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b64 exec, exec -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b64 exec, exec ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s[0:7] ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_mov_b64 s[6:7], exec -; GFX10-NEXT: s_mov_b64 exec, 31 -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 
GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readlane_b32 s0, v0, 0 @@ -50,10 +39,6 @@ ; GFX10-NEXT: v_readlane_b32 s2, v0, 2 ; GFX10-NEXT: v_readlane_b32 s3, v0, 3 ; GFX10-NEXT: v_readlane_b32 s4, v0, 4 -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 exec, s[6:7] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s[0:4] ; GFX10-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -0,0 +1,320 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=si-lower-sgpr-spills -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +# A simple SGPR spill. Implicit def for lane VGPR should be inserted just before the spill instruction. 
+--- +name: sgpr32_spill +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + bb.0: + liveins: $sgpr30_sgpr31, $sgpr10 + ; GCN-LABEL: name: sgpr32_spill + ; GCN: liveins: $sgpr30_sgpr31, $sgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_WRITELANE_B32_]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 + S_NOP 0 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_SETPC_B64 $sgpr30_sgpr31 +... + +# An additional virtual lane register is needed because the lanes of the current register are fully occupied while spilling a wide SGPR tuple. +# There must be two implicit defs, one for each lane VGPR. 
+ +--- +name: sgpr_spill_lane_crossover +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } + - { id: 1, type: spill-slot, size: 128, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + bb.0: + liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-LABEL: name: sgpr_spill_lane_crossover + ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr64, 0, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr65, 1, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr66, 2, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr67, 3, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr68, 4, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed 
$sgpr69, 5, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr70, 6, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr71, 7, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr72, 8, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr73, 9, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr74, 10, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr75, 11, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr76, 12, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr77, 13, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr78, 14, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr79, 15, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr80, 16, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr81, 17, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr82, 18, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr83, 19, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr84, 20, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr85, 21, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr86, 22, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr87, 23, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr88, 24, 
[[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr89, 25, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr90, 26, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr91, 27, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr92, 28, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr93, 29, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr94, 30, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 31, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 32, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr64, 33, [[V_WRITELANE_B32_1]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr65, 34, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr66, 35, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr67, 36, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr68, 37, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr69, 38, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr70, 39, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr71, 40, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr72, 41, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr73, 42, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr74, 43, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr75, 44, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr76, 45, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr77, 46, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr78, 47, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr79, 48, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr80, 49, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr81, 50, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr82, 51, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr83, 52, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr84, 53, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr85, 54, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr86, 55, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr87, 56, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr88, 57, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr89, 58, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr90, 59, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr91, 60, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr92, 61, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr93, 62, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr94, 63, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_2]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 0, [[V_WRITELANE_B32_2]], implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr64 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 33, implicit-def 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: $sgpr65 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 34 + ; GCN-NEXT: $sgpr66 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 35 + ; GCN-NEXT: $sgpr67 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 36 + ; GCN-NEXT: $sgpr68 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 37 + ; GCN-NEXT: $sgpr69 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 38 + ; GCN-NEXT: $sgpr70 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 39 + ; GCN-NEXT: $sgpr71 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 40 + ; GCN-NEXT: $sgpr72 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 41 + ; GCN-NEXT: $sgpr73 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 42 + ; GCN-NEXT: $sgpr74 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 43 + ; GCN-NEXT: $sgpr75 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 44 + ; GCN-NEXT: $sgpr76 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 45 + ; GCN-NEXT: $sgpr77 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 46 + ; GCN-NEXT: $sgpr78 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 47 + ; GCN-NEXT: $sgpr79 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 48 + ; GCN-NEXT: $sgpr80 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 49 + ; GCN-NEXT: $sgpr81 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 50 + ; GCN-NEXT: $sgpr82 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 51 + ; GCN-NEXT: $sgpr83 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 52 + ; GCN-NEXT: $sgpr84 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 53 + ; GCN-NEXT: $sgpr85 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 54 + ; GCN-NEXT: $sgpr86 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 55 + ; GCN-NEXT: $sgpr87 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 56 + ; GCN-NEXT: $sgpr88 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 57 + ; GCN-NEXT: $sgpr89 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 58 + ; GCN-NEXT: $sgpr90 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 59 + ; GCN-NEXT: $sgpr91 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 60 + ; GCN-NEXT: 
$sgpr92 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 61 + ; GCN-NEXT: $sgpr93 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 62 + ; GCN-NEXT: $sgpr94 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 63 + ; GCN-NEXT: $sgpr95 = V_READLANE_B32 [[V_WRITELANE_B32_2]], 0 + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 32 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 + S_NOP 0 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_NOP 0 + renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_SETPC_B64 $sgpr30_sgpr31 +... + +# The implicit def for the lane VGPR should be inserted at the common dominator block (the entry block here). 
+ +--- +name: lane_vgpr_implicit_def_at_common_dominator_block +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + ; GCN-LABEL: name: lane_vgpr_implicit_def_at_common_dominator_block + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = S_MOV_B32 20 + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 + bb.0: + liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + S_NOP 0 + S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit killed $scc + bb.1: + liveins: $sgpr10, $sgpr30_sgpr31 + $sgpr10 = S_MOV_B32 10 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_BRANCH %bb.3 + bb.2: + liveins: $sgpr10, $sgpr30_sgpr31 + $sgpr10 = S_MOV_B32 20 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_BRANCH %bb.3 + bb.3: + liveins: $sgpr10, $sgpr30_sgpr31 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 +... + +# The common dominator block is visited only at the end. The insertion point was initially identified as the +# terminator instruction in the dominator block, which later becomes the point where a spill gets inserted in the same block. + +--- +name: dominator_block_follows_the_successors_bbs +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + ; GCN-LABEL: name: dominator_block_follows_the_successors_bbs + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 %0, 0 + ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 %0, 0 + ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: 
successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4: + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 + bb.0: + liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + S_NOP 0 + S_BRANCH %bb.3 + bb.1: + liveins: $sgpr10, $sgpr30_sgpr31 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc + S_BRANCH %bb.2 + bb.2: + liveins: $sgpr10, $sgpr30_sgpr31 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc + S_BRANCH %bb.3 + bb.3: + liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + $sgpr10 = S_MOV_B32 10 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit killed $scc + S_BRANCH %bb.1 + bb.4: + liveins: $sgpr10, $sgpr30_sgpr31 + S_NOP 0 + S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -13,6 +13,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s35, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir --- a/llvm/test/CodeGen/AMDGPU/spill192.mir +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir @@ -32,32 +32,29 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr192 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr9, 5, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: 
[[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 bb.0: S_NOP 0, implicit-def %0:sgpr_192 diff --git a/llvm/test/CodeGen/AMDGPU/spill224.mir b/llvm/test/CodeGen/AMDGPU/spill224.mir --- a/llvm/test/CodeGen/AMDGPU/spill224.mir +++ b/llvm/test/CodeGen/AMDGPU/spill224.mir @@ -30,34 +30,31 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr224 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 6, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 
[[V_WRITELANE_B32_1]], 6 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 bb.0: S_NOP 0, implicit-def %0:sgpr_224 diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll --- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll @@ -20,8 +20,9 @@ ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_mov_b32 s36, s33 +; GCN-NEXT: v_writelane_b32 v1, s4, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v1, s30, 1 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -13,6 +13,7 @@ ; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -185,35 +186,36 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v46, s33, 0 +; GCN-NEXT: v_writelane_b32 v45, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 
0x800 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s48, 15 -; GCN-NEXT: v_writelane_b32 v40, s49, 16 -; GCN-NEXT: v_mov_b32_e32 v41, v31 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 +; GCN-NEXT: v_writelane_b32 v0, s31, 1 +; GCN-NEXT: v_writelane_b32 v0, s34, 2 +; GCN-NEXT: v_writelane_b32 v0, s35, 3 +; GCN-NEXT: v_writelane_b32 v0, s36, 4 +; GCN-NEXT: v_writelane_b32 v0, s37, 5 +; GCN-NEXT: v_writelane_b32 v0, s38, 6 +; GCN-NEXT: v_writelane_b32 v0, s39, 7 +; GCN-NEXT: v_writelane_b32 v0, s40, 8 +; GCN-NEXT: 
v_writelane_b32 v0, s41, 9 +; GCN-NEXT: v_writelane_b32 v0, s42, 10 +; GCN-NEXT: v_writelane_b32 v0, s43, 11 +; GCN-NEXT: v_writelane_b32 v0, s44, 12 +; GCN-NEXT: v_writelane_b32 v0, s45, 13 +; GCN-NEXT: v_writelane_b32 v0, s46, 14 +; GCN-NEXT: v_writelane_b32 v0, s48, 15 +; GCN-NEXT: v_writelane_b32 v0, s49, 16 +; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_mov_b32 s44, s14 ; GCN-NEXT: s_mov_b32 s45, s13 ; GCN-NEXT: s_mov_b32 s46, s12 @@ -224,24 +226,24 @@ ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41 -; GCN-NEXT: v_mov_b32_e32 v43, 0 -; GCN-NEXT: flat_load_dword v44, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: flat_load_dword v43, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 ; GCN-NEXT: s_getpc_b64 s[48:49] ; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v44 +; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v43 ; GCN-NEXT: s_branch .LBB1_3 ; GCN-NEXT: .LBB1_1: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: .LBB1_2: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: .LBB1_3: ; %bb2 ; GCN-NEXT: ; =>This Loop Header: Depth=1 @@ -250,8 +252,8 @@ ; GCN-NEXT: .LBB1_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: flat_load_dword v0, v[42:43] -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 +; 
GCN-NEXT: flat_load_dword v0, v[41:42] +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc @@ -282,7 +284,7 @@ ; GCN-NEXT: s_mov_b32 s12, s46 ; GCN-NEXT: s_mov_b32 s13, s45 ; GCN-NEXT: s_mov_b32 s14, s44 -; GCN-NEXT: v_mov_b32_e32 v31, v41 +; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49] ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 @@ -297,10 +299,10 @@ ; GCN-NEXT: ; %bb.9: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: .LBB1_10: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 ; GCN-NEXT: s_branch .LBB1_2 bb: %tmp = load float, float* null, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -11,7 +11,7 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0 @@ -29,10 +29,10 @@ ; GFX9-NEXT: s_mov_b32 s9, s4 ; GFX9-NEXT: s_mov_b32 s10, s4 ; GFX9-NEXT: s_mov_b32 s11, s4 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: 
buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART @@ -41,30 +41,31 @@ ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-NEXT: v_writelane_b32 v44, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v41 -; GFX9-NEXT: v_mov_b32_e32 v1, v42 -; GFX9-NEXT: v_mov_b32_e32 v2, v43 -; GFX9-NEXT: v_mov_b32_e32 v3, v44 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: v_mov_b32_e32 v1, v41 +; GFX9-NEXT: v_mov_b32_e32 v2, v42 
+; GFX9-NEXT: v_mov_b32_e32 v3, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 +; GFX9-NEXT: v_readlane_b32 s30, v44, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: v_readlane_b32 s33, v45, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -75,7 +76,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 @@ -94,10 +95,10 @@ ; GFX10-NEXT: s_mov_b32 s9, s4 ; GFX10-NEXT: s_mov_b32 s10, s4 ; GFX10-NEXT: s_mov_b32 s11, s4 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 
4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART @@ -106,33 +107,34 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr44 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v44, s30, 0 +; GFX10-NEXT: v_writelane_b32 v44, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v41 -; GFX10-NEXT: v_mov_b32_e32 v1, v42 -; GFX10-NEXT: v_mov_b32_e32 v2, v43 -; GFX10-NEXT: v_mov_b32_e32 v3, v44 +; GFX10-NEXT: v_mov_b32_e32 v0, v40 +; GFX10-NEXT: v_mov_b32_e32 v1, v41 +; GFX10-NEXT: v_mov_b32_e32 v2, v42 +; GFX10-NEXT: v_mov_b32_e32 v3, v43 ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword 
v40, off, s[0:3], s33 offset:12 +; GFX10-NEXT: v_readlane_b32 s31, v44, 1 +; GFX10-NEXT: v_readlane_b32 s30, v44, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: v_readlane_b32 s33, v45, 0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 @@ -167,32 +169,33 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_writelane_b32 v40, s36, 2 -; GFX9-NEXT: v_writelane_b32 v40, s37, 3 -; GFX9-NEXT: v_writelane_b32 v40, s38, 4 -; GFX9-NEXT: v_writelane_b32 v40, s39, 5 -; GFX9-NEXT: v_writelane_b32 v40, s40, 6 -; GFX9-NEXT: v_writelane_b32 v40, s41, 7 +; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: v_writelane_b32 v46, s33, 0 +; GFX9-NEXT: v_writelane_b32 v45, s30, 0 +; GFX9-NEXT: v_writelane_b32 v45, s31, 1 +; GFX9-NEXT: v_writelane_b32 v45, s36, 2 +; GFX9-NEXT: v_writelane_b32 v45, s37, 3 +; GFX9-NEXT: v_writelane_b32 v45, s38, 4 +; GFX9-NEXT: v_writelane_b32 v45, s39, 5 +; GFX9-NEXT: v_writelane_b32 v45, s40, 6 +; GFX9-NEXT: v_writelane_b32 v45, s41, 7 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v40, s42, 8 +; GFX9-NEXT: v_writelane_b32 v45, s42, 8 ; GFX9-NEXT: s_mov_b32 s36, 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, 
s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s43, 9 -; GFX9-NEXT: v_mov_b32_e32 v45, v16 -; GFX9-NEXT: v_mov_b32_e32 v44, v15 -; GFX9-NEXT: v_mov_b32_e32 v43, v14 -; GFX9-NEXT: v_mov_b32_e32 v42, v13 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v45, s43, 9 +; GFX9-NEXT: v_mov_b32_e32 v44, v16 +; GFX9-NEXT: v_mov_b32_e32 v43, v15 +; GFX9-NEXT: v_mov_b32_e32 v42, v14 +; GFX9-NEXT: v_mov_b32_e32 v41, v13 +; GFX9-NEXT: v_mov_b32_e32 v40, v12 ; GFX9-NEXT: s_mov_b32 s37, s36 ; GFX9-NEXT: s_mov_b32 s38, s36 ; GFX9-NEXT: s_mov_b32 s39, s36 @@ -200,7 +203,7 @@ ; GFX9-NEXT: s_mov_b32 s41, s36 ; GFX9-NEXT: s_mov_b32 s42, s36 ; GFX9-NEXT: s_mov_b32 s43, s36 -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[36:43], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 @@ -210,27 +213,27 @@ ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[36:43], s[4:7] dmask:0x1 ; GFX9-NEXT: s_nop 
0 -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s43, v40, 9 -; GFX9-NEXT: v_readlane_b32 s42, v40, 8 -; GFX9-NEXT: v_readlane_b32 s41, v40, 7 -; GFX9-NEXT: v_readlane_b32 s40, v40, 6 -; GFX9-NEXT: v_readlane_b32 s39, v40, 5 -; GFX9-NEXT: v_readlane_b32 s38, v40, 4 -; GFX9-NEXT: v_readlane_b32 s37, v40, 3 -; GFX9-NEXT: v_readlane_b32 s36, v40, 2 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s43, v45, 9 +; GFX9-NEXT: v_readlane_b32 s42, v45, 8 +; GFX9-NEXT: v_readlane_b32 s41, v45, 7 +; GFX9-NEXT: v_readlane_b32 s40, v45, 6 +; GFX9-NEXT: v_readlane_b32 s39, v45, 5 +; GFX9-NEXT: v_readlane_b32 s38, v45, 4 +; GFX9-NEXT: v_readlane_b32 s37, v45, 3 +; GFX9-NEXT: v_readlane_b32 s36, v45, 2 +; GFX9-NEXT: v_readlane_b32 s31, v45, 1 +; GFX9-NEXT: v_readlane_b32 s30, v45, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: v_readlane_b32 s33, v46, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v46, 
off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -241,40 +244,41 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr45 ; GFX10-NEXT: v_writelane_b32 v46, s33, 0 +; GFX10-NEXT: v_writelane_b32 v45, s30, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v41, v16 -; GFX10-NEXT: v_mov_b32_e32 v42, v15 -; GFX10-NEXT: v_mov_b32_e32 v43, v14 -; GFX10-NEXT: v_writelane_b32 v40, s36, 2 +; GFX10-NEXT: v_writelane_b32 v45, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v40, v16 +; GFX10-NEXT: 
v_mov_b32_e32 v41, v15 +; GFX10-NEXT: v_mov_b32_e32 v42, v14 +; GFX10-NEXT: v_mov_b32_e32 v43, v13 +; GFX10-NEXT: v_writelane_b32 v45, s36, 2 ; GFX10-NEXT: s_mov_b32 s36, 0 -; GFX10-NEXT: v_mov_b32_e32 v44, v13 -; GFX10-NEXT: v_mov_b32_e32 v45, v12 -; GFX10-NEXT: v_writelane_b32 v40, s37, 3 +; GFX10-NEXT: v_mov_b32_e32 v44, v12 +; GFX10-NEXT: v_writelane_b32 v45, s37, 3 ; GFX10-NEXT: s_mov_b32 s37, s36 -; GFX10-NEXT: v_writelane_b32 v40, s38, 4 +; GFX10-NEXT: v_writelane_b32 v45, s38, 4 ; GFX10-NEXT: s_mov_b32 s38, s36 -; GFX10-NEXT: v_writelane_b32 v40, s39, 5 +; GFX10-NEXT: v_writelane_b32 v45, s39, 5 ; GFX10-NEXT: s_mov_b32 s39, s36 -; GFX10-NEXT: v_writelane_b32 v40, s40, 6 +; GFX10-NEXT: v_writelane_b32 v45, s40, 6 ; GFX10-NEXT: s_mov_b32 s40, s36 -; GFX10-NEXT: v_writelane_b32 v40, s41, 7 +; GFX10-NEXT: v_writelane_b32 v45, s41, 7 ; GFX10-NEXT: s_mov_b32 s41, s36 -; GFX10-NEXT: v_writelane_b32 v40, s42, 8 +; GFX10-NEXT: v_writelane_b32 v45, s42, 8 ; GFX10-NEXT: s_mov_b32 s42, s36 -; GFX10-NEXT: v_writelane_b32 v40, s43, 9 +; GFX10-NEXT: v_writelane_b32 v45, s43, 9 ; GFX10-NEXT: s_mov_b32 s43, s36 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -286,28 +290,28 @@ ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 -; 
GFX10-NEXT: v_readlane_b32 s43, v40, 9 -; GFX10-NEXT: v_readlane_b32 s42, v40, 8 -; GFX10-NEXT: v_readlane_b32 s41, v40, 7 -; GFX10-NEXT: v_readlane_b32 s40, v40, 6 -; GFX10-NEXT: v_readlane_b32 s39, v40, 5 -; GFX10-NEXT: v_readlane_b32 s38, v40, 4 -; GFX10-NEXT: v_readlane_b32 s37, v40, 3 -; GFX10-NEXT: v_readlane_b32 s36, v40, 2 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 +; GFX10-NEXT: v_readlane_b32 s43, v45, 9 +; GFX10-NEXT: v_readlane_b32 s42, v45, 8 +; GFX10-NEXT: v_readlane_b32 s41, v45, 7 +; GFX10-NEXT: v_readlane_b32 s40, v45, 6 +; GFX10-NEXT: v_readlane_b32 s39, v45, 5 +; GFX10-NEXT: v_readlane_b32 s38, v45, 4 +; GFX10-NEXT: v_readlane_b32 s37, v45, 3 +; GFX10-NEXT: v_readlane_b32 s36, v45, 2 +; GFX10-NEXT: v_readlane_b32 s31, v45, 1 +; GFX10-NEXT: v_readlane_b32 s30, v45, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: v_readlane_b32 s33, v46, 0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -133,9 +133,10 @@ ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 
4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 @@ -144,10 +145,12 @@ ; GFX9-O0-NEXT: s_mov_b32 s39, s7 ; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39] ; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37] -; GFX9-O0-NEXT: v_writelane_b32 v5, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v5, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v5, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v5, s43, 3 +; GFX9-O0-NEXT: ; implicit-def: $vgpr3 +; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s43, 3 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -170,9 +173,12 @@ ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 5 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 
5 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 @@ -196,13 +202,15 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge -; GFX9-O0-NEXT: v_readlane_b32 s34, v5, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v5, 5 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s34, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v0, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s36, v5, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v5, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v5, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v5, 3 +; GFX9-O0-NEXT: v_readlane_b32 s36, v0, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v0, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v0, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v0, 3 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -215,9 +223,10 @@ ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -332,15 +341,17 @@ ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s35, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 -; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x800 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s31, 1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 @@ -367,14 +378,16 @@ ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s31, v0, 1 +; GFX9-O0-NEXT: v_readlane_b32 
s30, v0, 0 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff800 ; GFX9-O0-NEXT: s_mov_b32 s33, s35 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -387,8 +400,9 @@ ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-O3-NEXT: ; implicit-def: $vgpr3 ; GFX9-O3-NEXT: s_mov_b32 s38, s33 +; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 @@ -511,36 +525,37 @@ ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; 
GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s42, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 -; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: 
def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 ; GFX9-O0-NEXT: s_mov_b32 s38, s6 ; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: v_writelane_b32 v10, s36, 2 -; GFX9-O0-NEXT: v_writelane_b32 v10, s37, 3 -; GFX9-O0-NEXT: v_writelane_b32 v10, s38, 4 -; GFX9-O0-NEXT: v_writelane_b32 v10, s39, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 3 +; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 5 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41 killed $sgpr34_sgpr35 @@ -552,8 +567,9 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v10, s34, 6 -; GFX9-O0-NEXT: v_writelane_b32 v10, s35, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 7 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 @@ -570,12 +586,14 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s34, v10, 6 -; GFX9-O0-NEXT: v_readlane_b32 s35, v10, 7 -; GFX9-O0-NEXT: v_readlane_b32 s36, v10, 2 -; GFX9-O0-NEXT: v_readlane_b32 s37, v10, 3 -; GFX9-O0-NEXT: v_readlane_b32 s38, v10, 4 -; GFX9-O0-NEXT: v_readlane_b32 s39, v10, 5 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 6 +; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 7 +; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5 ; 
GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 @@ -587,31 +605,33 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s31, v0, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v0, 0 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 ; GFX9-O0-NEXT: s_mov_b32 s33, s42 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -630,8 +650,9 @@ ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 +; GFX9-O3-NEXT: ; implicit-def: $vgpr8 ; GFX9-O3-NEXT: s_mov_b32 s40, s33 +; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1