diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -69,25 +69,7 @@ // We need to save and restore the current FP/BP. - // 1: If there is already a VGPR with free lanes, use it. We - // may already have to pay the penalty for spilling a CSR VGPR. - if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { - int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, - TargetStackID::SGPRSpill); - - if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) - llvm_unreachable("allocate SGPR spill should have worked"); - - FrameIndex = NewFI; - - LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); - dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to " - << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane - << '\n'); - return; - } - - // 2: Next, try to save the FP/BP in an unused SGPR. + // 1: Try to save the FP/BP in an unused SGPR. TempSGPR = findScratchNonCalleeSaveRegister( MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); @@ -95,21 +77,20 @@ int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, TargetStackID::SGPRSpill); - if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { - // 3: There's no free lane to spill, and no free register to save FP/BP, + if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPRLane( + MF, NewFI, /* IsPrologEpilog */ true)) { + // 2: There's no free lane to spill, and no free register to save FP/BP, // so we're forced to spill another VGPR to use for the spill. - auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); - MFI->allocateWWMSpill(MF, Spill.VGPR); - FrameIndex = NewFI; LLVM_DEBUG( + auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(NewFI).front(); dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to " << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); } else { // Remove dead index MF.getFrameInfo().RemoveStackObject(NewFI); - // 4: If all else fails, spill the FP/BP to memory. + // 3: If all else fails, spill the FP/BP to memory. FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4)); LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling " << (IsFP ? "FP" : "BP") << '\n'); @@ -822,7 +803,7 @@ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getSGPRToVGPRSpills(FI); + FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); assert(Spill.size() == 1); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) @@ -1020,7 +1001,7 @@ auto RestoreSGPRFromVGPRLane = [&](Register Reg, const int FI) { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getSGPRToVGPRSpills(FI); + FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); assert(Spill.size() == 1); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), Reg) .addReg(Spill[0].VGPR) @@ -1266,13 +1247,6 @@ } } - for (MachineBasicBlock &MBB : MF) { - for (auto &Reg : MFI->getWWMSpills()) - MBB.addLiveIn(Reg.first); - - MBB.sortUniqueLiveIns(); - } - // Ignore the SGPRs the default implementation found. SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask()); @@ -1318,6 +1292,14 @@ getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy, MFI->BasePointerSaveIndex, false); } + + // Mark all lane VGPRs as BB LiveIns. + for (MachineBasicBlock &MBB : MF) { + for (auto &Reg : MFI->getWWMSpills()) + MBB.addLiveIn(Reg.first); + + MBB.sortUniqueLiveIns(); + } } void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -296,7 +296,7 @@ int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { + if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { NewReservedRegs = true; bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( MI, FI, nullptr, Indexes, LIS); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -438,10 +438,15 @@ }; private: - // Track VGPR + wave index for each subregister of the SGPR spilled to - // frameindex key. - DenseMap> SGPRToVGPRSpills; + // To track VGPR + lane index for each subregister of the SGPR spilled to + // frameindex key during SILowerSGPRSpills pass. + DenseMap> SGPRSpillToVGPRLanes; + // To track VGPR + lane index for spilling special SGPRs like Frame Pointer + // identified during PrologEpilogInserter. + DenseMap> + PrologEpilogSGPRSpillToVGPRLanes; unsigned NumVGPRSpillLanes = 0; + unsigned NumVGPRPrologEpilogSpillLanes = 0; SmallVector SpillVGPRs; using WWMSpillsMap = MapVector; // To track the registers used in instructions that can potentially modify the @@ -474,6 +479,11 @@ private: Register VGPRForAGPRCopy; + bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); + bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); + public: Register getVGPRForAGPRCopy() const { return VGPRForAGPRCopy; @@ -517,9 +527,9 @@ } ArrayRef - getSGPRToVGPRSpills(int FrameIndex) const { - auto I = SGPRToVGPRSpills.find(FrameIndex); - return (I == SGPRToVGPRSpills.end()) + getSGPRSpillToVGPRLanes(int FrameIndex) const { + auto I = SGPRSpillToVGPRLanes.find(FrameIndex); + return (I == SGPRSpillToVGPRLanes.end()) ? ArrayRef() : makeArrayRef(I->second); } @@ -528,6 +538,14 @@ const WWMSpillsMap &getWWMSpills() const { return WWMSpills; } const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; } + ArrayRef + getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const { + auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex); + return (I == PrologEpilogSGPRSpillToVGPRLanes.end()) + ? ArrayRef() + : makeArrayRef(I->second); + } + void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4, Align Alignment = Align(4)); @@ -551,9 +569,8 @@ I->second.IsDead = true; } - bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, - unsigned NumLane) const; - bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); + bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, + bool IsPrologEpilog = false); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); /// If \p ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -290,21 +290,67 @@ return false; } -/// \p returns true if \p NumLanes slots are available in VGPRs already used for -/// SGPR spilling. -// -// FIXME: This only works after processFunctionBeforeFrameFinalized -bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF, - unsigned NumNeed) const { +bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, + int FI, + unsigned LaneIndex) { const GCNSubtarget &ST = MF.getSubtarget(); - unsigned WaveSize = ST.getWavefrontSize(); - return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register LaneVGPR; + if (!LaneIndex) { + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (LaneVGPR == AMDGPU::NoRegister) { + // We have no VGPRs left for spilling SGPRs. Reset because we will not + // partially spill the SGPR to VGPRs. + SGPRSpillToVGPRLanes.erase(FI); + return false; + } + + SpillVGPRs.push_back(LaneVGPR); + // Add this register as live-in to all blocks to avoid machine verifier + // complaining about use of an undefined physical register. + for (MachineBasicBlock &BB : MF) + BB.addLiveIn(LaneVGPR); + } else { + LaneVGPR = SpillVGPRs.back(); + } + + SGPRSpillToVGPRLanes[FI].push_back( + SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); + return true; } -/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. -bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, - int FI) { - std::vector &SpillLanes = SGPRToVGPRSpills[FI]; +bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills( + MachineFunction &MF, int FI, unsigned LaneIndex) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register LaneVGPR; + if (!LaneIndex) { + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (LaneVGPR == AMDGPU::NoRegister) { + // We have no VGPRs left for spilling SGPRs. Reset because we will not + // partially spill the SGPR to VGPRs. + PrologEpilogSGPRSpillToVGPRLanes.erase(FI); + return false; + } + + allocateWWMSpill(MF, LaneVGPR); + } else { + LaneVGPR = WWMSpills.back().first; + } + + PrologEpilogSGPRSpillToVGPRLanes[FI].push_back( + SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); + return true; +} + +bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, + int FI, + bool IsPrologEpilog) { + std::vector &SpillLanes = + IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI] + : SGPRSpillToVGPRLanes[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -313,7 +359,6 @@ const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned WaveSize = ST.getWavefrontSize(); unsigned Size = FrameInfo.getObjectSize(FI); @@ -325,42 +370,20 @@ assert(Size >= 4 && "invalid sgpr spill size"); assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs"); - // Make sure to handle the case where a wide SGPR spill may span between two - // VGPRs. - for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { - Register LaneVGPR; - unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize); - - if (VGPRIndex == 0) { - LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); - if (LaneVGPR == AMDGPU::NoRegister) { - // We have no VGPRs left for spilling SGPRs. Reset because we will not - // partially spill the SGPR to VGPRs. - SGPRToVGPRSpills.erase(FI); - NumVGPRSpillLanes -= I; - - // FIXME: We can run out of free registers with split allocation if - // IPRA is enabled and a called function already uses every VGPR. -#if 0 - DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), - "VGPRs for SGPR spilling", - 0, DS_Error); - MF.getFunction().getContext().diagnose(DiagOutOfRegs); -#endif - return false; - } + unsigned &NumSpillLanes = + IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes; - SpillVGPRs.push_back(LaneVGPR); + for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) { + unsigned LaneIndex = (NumSpillLanes % WaveSize); - // Add this register as live-in to all blocks to avoid machine verifier - // complaining about use of an undefined physical register. - for (MachineBasicBlock &BB : MF) - BB.addLiveIn(LaneVGPR); - } else { - LaneVGPR = SpillVGPRs.back(); + bool Allocated = + IsPrologEpilog + ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex) + : allocateVGPRForSGPRSpills(MF, FI, LaneIndex); + if (!Allocated) { + NumSpillLanes -= I; + return false; } - - SpillLanes.push_back(SIRegisterInfo::SpilledReg(LaneVGPR, VGPRIndex)); } return true; @@ -437,16 +460,14 @@ bool SIMachineFunctionInfo::removeDeadFrameIndices( MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { - // Remove dead frame indices from function frame, however keep FP & BP since - // spills for them haven't been inserted yet. And also make sure to remove the - // frame indices from `SGPRToVGPRSpills` data structure, otherwise, it could - // result in an unexpected side effect and bug, in case of any re-mapping of - // freed frame indices by later pass(es) like "stack slot coloring". - for (auto &R : make_early_inc_range(SGPRToVGPRSpills)) { - if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) { - MFI.RemoveStackObject(R.first); - SGPRToVGPRSpills.erase(R.first); - } + // Remove dead frame indices from function frame. And also make sure to remove + // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it + // could result in an unexpected side effect and bug, in case of any + // re-mapping of freed frame indices by later pass(es) like "stack slot + // coloring". + for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) { + MFI.RemoveStackObject(R.first); + SGPRSpillToVGPRLanes.erase(R.first); } bool HaveSGPRToMemory = false; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1705,7 +1705,7 @@ LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index); + ArrayRef VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1822,7 +1822,7 @@ LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index); + ArrayRef VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -9,8 +9,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: v_writelane_b32 v40, s33, 2 +; CHECK-NEXT: v_writelane_b32 v41, s33, 0 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 @@ -27,9 +28,10 @@ ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: v_readlane_b32 s33, v40, 2 +; CHECK-NEXT: v_readlane_b32 s33, v41, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -235,8 +235,9 @@ ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: v_writelane_b32 v40, s33, 2 +; MUBUF-NEXT: v_writelane_b32 v41, s33, 0 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 @@ -256,9 +257,10 @@ ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 -; MUBUF-NEXT: v_readlane_b32 s33, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s33, v41, 0 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -268,8 +270,9 @@ ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: v_writelane_b32 v40, s33, 2 +; FLATSCR-NEXT: v_writelane_b32 v41, s33, 0 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 @@ -289,9 +292,10 @@ ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 -; FLATSCR-NEXT: v_readlane_b32 s33, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s33, v41, 0 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword v41, off, s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -305,10 +309,11 @@ ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: v_writelane_b32 v40, s33, 2 +; MUBUF-NEXT: v_writelane_b32 v41, s33, 0 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 @@ -373,9 +378,10 @@ ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 -; MUBUF-NEXT: v_readlane_b32 s33, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s33, v41, 0 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -385,9 +391,10 @@ ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off -; FLATSCR-NEXT: v_writelane_b32 v40, s33, 2 +; FLATSCR-NEXT: v_writelane_b32 v41, s33, 0 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 @@ -422,9 +429,10 @@ ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 -; FLATSCR-NEXT: v_readlane_b32 s33, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s33, v41, 0 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword v41, off, s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -234,13 +234,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 @@ -248,9 +249,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -19,8 +19,9 @@ ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FIXEDABI-NEXT: s_or_saveexec_b64 s[16:17], -1 ; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FIXEDABI-NEXT: s_mov_b64 exec, s[16:17] -; FIXEDABI-NEXT: v_writelane_b32 v40, s33, 2 +; FIXEDABI-NEXT: v_writelane_b32 v41, s33, 0 ; FIXEDABI-NEXT: s_mov_b32 s33, s32 ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 ; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0 @@ -32,9 +33,10 @@ ; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0 ; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00 -; FIXEDABI-NEXT: v_readlane_b32 s33, v40, 2 +; FIXEDABI-NEXT: v_readlane_b32 s33, v41, 0 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; FIXEDABI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; FIXEDABI-NEXT: s_mov_b64 exec, s[4:5] ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -172,7 +172,7 @@ ; GCN-NEXT: .vgpr_count: 0x1{{$}} ; GCN-NEXT: no_stack_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x24{{$}} +; GCN-NEXT: .sgpr_count: 0x25{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; GCN-NEXT: .vgpr_count: 0x3{{$}} ; GCN-NEXT: no_stack_extern_call: @@ -180,19 +180,19 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: no_stack_extern_call_many_args: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: no_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_lds: ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x20{{$}} @@ -202,7 +202,7 @@ ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x26{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x29{{$}} +; GCN-NEXT: .vgpr_count: 0x2a{{$}} ; GCN-NEXT: simple_stack: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x21{{$}} @@ -210,7 +210,7 @@ ; GCN-NEXT: .vgpr_count: 0x2{{$}} ; GCN-NEXT: simple_stack_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x24{{$}} +; GCN-NEXT: .sgpr_count: 0x25{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; GCN-NEXT: .vgpr_count: 0x4{{$}} ; GCN-NEXT: simple_stack_extern_call: @@ -218,16 +218,16 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} -; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .stack_frame_size_in_bytes: 0x30{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_stack_recurse: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x26{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2a{{$}} +; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: ... diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -1420,10 +1420,10 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v2, s33, 2 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v2, s30, 0 ; GCN-NEXT: v_writelane_b32 v2, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -1438,7 +1438,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v2, 1 ; GCN-NEXT: v_readlane_b32 s30, v2, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v2, 2 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1451,7 +1451,7 @@ ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v2, s33, 2 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] @@ -1468,7 +1468,7 @@ ; GFX7-NEXT: v_readlane_b32 s31, v2, 1 ; GFX7-NEXT: v_readlane_b32 s30, v2, 0 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: v_readlane_b32 s33, v2, 2 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -1481,7 +1481,7 @@ ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v2, s33, 2 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] @@ -1498,7 +1498,7 @@ ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: v_readlane_b32 s33, v2, 2 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -1511,7 +1511,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v2, s33, 2 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -1527,7 +1527,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v2, 2 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -1542,14 +1542,14 @@ ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v2, s33, 2 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1558,7 +1558,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v2, 2 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -1578,10 +1578,10 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v3, s33, 2 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v3, s30, 0 ; GCN-NEXT: v_writelane_b32 v3, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -1600,7 +1600,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v3, 1 ; GCN-NEXT: v_readlane_b32 s30, v3, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v3, 2 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1613,7 +1613,7 @@ ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v3, s33, 2 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] @@ -1634,7 +1634,7 @@ ; GFX7-NEXT: v_readlane_b32 s31, v3, 1 ; GFX7-NEXT: v_readlane_b32 s30, v3, 0 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: v_readlane_b32 s33, v3, 2 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -1647,7 +1647,7 @@ ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v2, s33, 2 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] @@ -1663,7 +1663,7 @@ ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: v_readlane_b32 s33, v2, 2 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -1676,7 +1676,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v2, s33, 2 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -1692,7 +1692,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v2, 2 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -1707,14 +1707,14 @@ ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v2, s33, 2 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1723,7 +1723,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v2, 2 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -1743,10 +1743,10 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v4, s33, 2 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v4, s30, 0 ; GCN-NEXT: v_writelane_b32 v4, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -1766,7 +1766,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v4, 1 ; GCN-NEXT: v_readlane_b32 s30, v4, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v4, 2 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1779,7 +1779,7 @@ ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v4, s33, 2 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] @@ -1801,7 +1801,7 @@ ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: v_readlane_b32 s33, v4, 2 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -1814,7 +1814,7 @@ ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v3, s33, 2 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] @@ -1834,7 +1834,7 @@ ; GFX8-NEXT: v_readlane_b32 s31, v3, 1 ; GFX8-NEXT: v_readlane_b32 s30, v3, 0 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: v_readlane_b32 s33, v3, 2 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -1847,7 +1847,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v3, s33, 2 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 @@ -1869,7 +1869,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -1884,7 +1884,7 @@ ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v3, s33, 2 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] @@ -1905,7 +1905,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v3, 2 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -1925,10 +1925,10 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v5, s33, 2 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v5, s30, 0 ; GCN-NEXT: v_writelane_b32 v5, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -1955,7 +1955,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v5, 1 ; GCN-NEXT: v_readlane_b32 s30, v5, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v5, 2 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1968,7 +1968,7 @@ ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v5, s33, 2 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] @@ -1997,7 +1997,7 @@ ; GFX7-NEXT: v_readlane_b32 s31, v5, 1 ; GFX7-NEXT: v_readlane_b32 s30, v5, 0 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: v_readlane_b32 s33, v5, 2 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -2010,7 +2010,7 @@ ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v3, s33, 2 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] @@ -2037,7 +2037,7 @@ ; GFX8-NEXT: v_readlane_b32 s31, v3, 1 ; GFX8-NEXT: v_readlane_b32 s30, v3, 0 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: v_readlane_b32 s33, v3, 2 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -2050,7 +2050,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v3, s33, 2 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -2072,7 +2072,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -2087,14 +2087,14 @@ ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v3, s33, 2 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2109,7 +2109,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v3, 2 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -2129,10 +2129,10 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v9, s33, 2 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v9, s30, 0 ; GCN-NEXT: v_writelane_b32 v9, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -2175,7 +2175,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v9, 1 ; GCN-NEXT: v_readlane_b32 s30, v9, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v9, 2 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -2188,7 +2188,7 @@ ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v9, s33, 2 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] @@ -2233,7 +2233,7 @@ ; GFX7-NEXT: v_readlane_b32 s31, v9, 1 ; GFX7-NEXT: v_readlane_b32 s30, v9, 0 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: v_readlane_b32 s33, v9, 2 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -2246,7 +2246,7 @@ ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v5, s33, 2 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] @@ -2287,7 +2287,7 @@ ; GFX8-NEXT: v_readlane_b32 s31, v5, 1 ; GFX8-NEXT: v_readlane_b32 s30, v5, 0 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: v_readlane_b32 s33, v5, 2 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -2300,7 +2300,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v5, s33, 2 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -2330,7 +2330,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v5, 1 ; GFX9-NEXT: v_readlane_b32 s30, v5, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v5, 2 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -2345,14 +2345,14 @@ ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v5, s33, 2 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v5, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v5, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2375,7 +2375,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v5, 1 ; GFX10-NEXT: v_readlane_b32 s30, v5, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v5, 2 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -2395,10 +2395,10 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v17, s33, 2 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v17, s30, 0 ; GCN-NEXT: v_writelane_b32 v17, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -2473,7 +2473,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v17, 1 ; GCN-NEXT: v_readlane_b32 s30, v17, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v17, 2 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -2486,7 +2486,7 @@ ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v17, s33, 2 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] @@ -2563,7 +2563,7 @@ ; GFX7-NEXT: v_readlane_b32 s31, v17, 1 ; GFX7-NEXT: v_readlane_b32 s30, v17, 0 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: v_readlane_b32 s33, v17, 2 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -2576,7 +2576,7 @@ ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v9, s33, 2 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] @@ -2645,7 +2645,7 @@ ; GFX8-NEXT: v_readlane_b32 s31, v9, 1 ; GFX8-NEXT: v_readlane_b32 s30, v9, 0 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: v_readlane_b32 s33, v9, 2 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -2658,7 +2658,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v9, s33, 2 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -2704,7 +2704,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v9, 1 ; GFX9-NEXT: v_readlane_b32 s30, v9, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v9, 2 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -2719,14 +2719,14 @@ ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v9, s33, 2 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v9, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v9, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2765,7 +2765,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v9, 1 ; GFX10-NEXT: v_readlane_b32 s30, v9, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v9, 2 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -14,16 +14,16 @@ } ; GCN-LABEL: {{^}}indirect_use_vcc: -; GCN: v_writelane_b32 v40, s33, 2 +; GCN: v_writelane_b32 v41, s33, 0 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 s33, v40, 2 +; GCN: v_readlane_b32 s33, v41, 0 ; GCN: s_setpc_b64 s[30:31] ; GCN: ; NumSgprs: 36 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define void @indirect_use_vcc() #1 { call void @use_vcc() ret void @@ -34,7 +34,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_vcc() ret void @@ -52,7 +52,7 @@ ; GCN-LABEL: {{^}}indirect_use_flat_scratch: ; CI: ; NumSgprs: 38 ; VI: ; NumSgprs: 40 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() ret void @@ -63,7 +63,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_flat_scratch() ret void @@ -78,7 +78,7 @@ } ; GCN-LABEL: {{^}}indirect_use_10_vgpr: -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define void @indirect_use_10_vgpr() #0 { call void @use_10_vgpr() ret void @@ -86,7 +86,7 @@ ; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr: ; GCN: is_dynamic_callstack = 0 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { call void @indirect_use_10_vgpr() ret void diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -23,10 +23,12 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; MUBUF: buffer_store_dword +; MUBUF: buffer_store_dword +; FLATSCR: scratch_store_dword ; FLATSCR: scratch_store_dword -; GCN: v_writelane_b32 v40, s33, 4 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 +; GCN: v_writelane_b32 v41, s33, 0 ; GCN: v_writelane_b32 v40, s34, 2 ; GCN: v_writelane_b32 v40, s35, 3 @@ -41,8 +43,10 @@ ; FLATSCR-DAG: v_readlane_b32 s31, v40, 1 ; FLATSCR-DAG: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 s33, v40, 4 +; GCN: v_readlane_b32 s33, v41, 0 ; MUBUF: buffer_load_dword +; MUBUF: buffer_load_dword +; FLATSCR: scratch_load_dword ; FLATSCR: scratch_load_dword ; GCN: s_setpc_b64 s[30:31] define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { @@ -54,8 +58,10 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: ; MUBUF: buffer_store_dword v40 +; MUBUF: buffer_store_dword v41 ; FLATSCR: scratch_store_dword off, v40 -; GCN: v_writelane_b32 v40, s33, 4 +; FLATSCR: scratch_store_dword off, v41 +; GCN: v_writelane_b32 v41, s33, 0 ; GCN: s_mov_b32 s33, s32 ; MUBUF: s_addk_i32 s32, 0x400 @@ -63,9 +69,11 @@ ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s33, v40, 4 +; GCN: v_readlane_b32 s33, v41, 0 ; MUBUF: buffer_load_dword v40 +; MUBUF: buffer_load_dword v41 ; FLATSCR: scratch_load_dword v40 +; FLATSCR: scratch_load_dword v41 define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -87,9 +87,11 @@ ; GCN-NEXT: s_waitcnt ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN: v_writelane_b32 [[CSR_VGPR_1]], s33, 0 ; GCN-DAG: s_mov_b32 s33, s32 ; MUBUF-DAG: s_addk_i32 s32, 0x400{{$}} ; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}} @@ -107,10 +109,12 @@ ; MUBUF: s_addk_i32 s32, 0xfc00{{$}} ; FLATSCR: s_add_i32 s32, s32, -16{{$}} -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 +; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR_1]], 0 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -132,11 +136,13 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; MUBUF-DAG: s_addk_i32 s32, 0x400 ; FLATSCR-DAG: s_add_i32 s32, s32, 16 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s33, [[FP_SPILL_LANE:[0-9]+]] +; GCN-DAG: v_writelane_b32 [[CSR_VGPR_1]], s33, [[FP_SPILL_LANE:[0-9]+]] ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 @@ -147,10 +153,12 @@ ; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, -16 -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]] +; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR_1]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -269,8 +277,8 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v0, s33, 63 ; GCN-COUNT-60: v_writelane_b32 v0 +; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 ; GCN: v_writelane_b32 v0 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill @@ -285,7 +293,7 @@ ; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, 16 ; FLATSCR: s_add_i32 s32, s32, -16 -; GCN-NEXT: v_readlane_b32 s33, v0, 63 +; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload @@ -389,7 +397,7 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 @@ -404,7 +412,7 @@ ; GCN: v_readlane_b32 s30, [[CSR_VGPR]], 0 ; MUBUF: s_addk_i32 s32, 0xfd00 ; FLATSCR: s_add_i32 s32, s32, -12 -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 +; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload @@ -432,7 +440,7 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; MUBUF: s_addk_i32 s32, 0x300{{$}} ; FLATSCR: s_add_i32 s32, s32, 12{{$}} @@ -443,7 +451,7 @@ ; GCN: ;;#ASMSTART ; MUBUF: s_addk_i32 s32, 0xfd00{{$}} ; FLATSCR: s_add_i32 s32, s32, -12{{$}} -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 +; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload @@ -480,7 +488,7 @@ ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-DAG: s_mov_b32 s33, s32 ; MUBUF-DAG: s_add_i32 s32, s32, 0x40300{{$}} ; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}} @@ -490,7 +498,7 @@ ; GCN: ;;#ASMSTART ; MUBUF: s_add_i32 s32, s32, 0xfffbfd00{{$}} ; FLATSCR: s_addk_i32 s32, 0xeff4{{$}} -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 +; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload @@ -530,7 +538,7 @@ ; An FP is needed, despite not needing any spills ; TODO: Ccould see callee does not use stack and omit FP. ; GCN-LABEL: {{^}}ipra_call_with_stack: -; GCN: v_writelane_b32 v0, s33, 2 +; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 @@ -539,7 +547,7 @@ ; GCN: s_swappc_b64 ; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, -16 -; GCN: v_readlane_b32 s33, v0, 2 +; GCN: s_mov_b32 s33, [[TMP_SGPR]] define void @ipra_call_with_stack() #0 { %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -29,8 +29,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -42,9 +43,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -63,8 +65,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -76,9 +79,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -97,8 +101,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -110,9 +115,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -131,8 +137,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -145,9 +152,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -18,8 +18,8 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: v_writelane_b32 v40, s33, 16 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: v_writelane_b32 v40, s34, 2 @@ -33,6 +33,7 @@ ; CHECK-NEXT: v_writelane_b32 v40, s42, 10 ; CHECK-NEXT: v_writelane_b32 v40, s43, 11 ; CHECK-NEXT: v_writelane_b32 v40, s44, 12 +; CHECK-NEXT: v_writelane_b32 v42, s33, 0 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s45, 13 @@ -91,9 +92,10 @@ ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: v_readlane_b32 s33, v40, 16 +; CHECK-NEXT: v_readlane_b32 s33, v42, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -12,8 +12,9 @@ ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s33, 2 +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s33, 0 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 @@ -28,9 +29,10 @@ ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 -; SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v40, 2 +; SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v41, 0 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SPILL-TO-VGPR-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -11,7 +11,6 @@ ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SDAG-NEXT: s_mov_b64 exec, s[34:35] -; SDAG-NEXT: v_writelane_b32 v40, s33, 28 ; SDAG-NEXT: v_writelane_b32 v40, s4, 0 ; SDAG-NEXT: v_writelane_b32 v40, s5, 1 ; SDAG-NEXT: v_writelane_b32 v40, s6, 2 @@ -32,6 +31,7 @@ ; SDAG-NEXT: v_writelane_b32 v40, s21, 17 ; SDAG-NEXT: v_writelane_b32 v40, s22, 18 ; SDAG-NEXT: v_writelane_b32 v40, s23, 19 +; SDAG-NEXT: s_mov_b32 s36, s33 ; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x400 ; SDAG-NEXT: v_writelane_b32 v40, s24, 20 @@ -78,7 +78,7 @@ ; SDAG-NEXT: v_readlane_b32 s5, v40, 1 ; SDAG-NEXT: v_readlane_b32 s4, v40, 0 ; SDAG-NEXT: s_addk_i32 s32, 0xfc00 -; SDAG-NEXT: v_readlane_b32 s33, v40, 28 +; SDAG-NEXT: s_mov_b32 s33, s36 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] @@ -91,7 +91,6 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[34:35] -; GISEL-NEXT: v_writelane_b32 v40, s33, 28 ; GISEL-NEXT: v_writelane_b32 v40, s4, 0 ; GISEL-NEXT: v_writelane_b32 v40, s5, 1 ; GISEL-NEXT: v_writelane_b32 v40, s6, 2 @@ -112,6 +111,7 @@ ; GISEL-NEXT: v_writelane_b32 v40, s21, 17 ; GISEL-NEXT: v_writelane_b32 v40, s22, 18 ; GISEL-NEXT: v_writelane_b32 v40, s23, 19 +; GISEL-NEXT: s_mov_b32 s36, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s24, 20 @@ -158,7 +158,7 @@ ; GISEL-NEXT: v_readlane_b32 s5, v40, 1 ; GISEL-NEXT: v_readlane_b32 s4, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 28 +; GISEL-NEXT: s_mov_b32 s33, s36 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -99,8 +99,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -114,9 +115,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -127,25 +129,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -156,26 +161,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -186,25 +194,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -219,10 +230,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -236,9 +248,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -249,17 +262,18 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 @@ -267,9 +281,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -280,17 +296,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 @@ -298,9 +316,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -311,17 +331,18 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 @@ -329,9 +350,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -347,10 +370,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -364,9 +388,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -377,17 +402,18 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 @@ -395,9 +421,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -408,17 +436,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 @@ -426,9 +456,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -439,17 +471,18 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 @@ -457,9 +490,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -475,8 +510,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -489,9 +525,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -502,24 +539,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -530,25 +570,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -559,24 +603,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -591,10 +638,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -606,9 +654,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -619,25 +668,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -648,26 +700,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -678,25 +734,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -712,10 +771,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -727,9 +787,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -740,25 +801,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -769,26 +833,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -799,25 +867,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -833,8 +904,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -847,9 +919,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -860,24 +933,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -888,25 +964,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -917,24 +997,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -949,10 +1032,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -964,9 +1048,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -977,25 +1062,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1006,26 +1094,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1036,25 +1128,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1070,10 +1165,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1085,9 +1181,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1098,25 +1195,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1127,26 +1227,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1157,25 +1261,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1191,8 +1298,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1205,9 +1313,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1218,24 +1327,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1246,25 +1358,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1275,24 +1391,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1307,8 +1426,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1322,9 +1442,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1335,25 +1456,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1364,25 +1488,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1393,25 +1521,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1426,11 +1557,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1442,9 +1574,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1455,26 +1588,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1485,27 +1621,31 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1516,26 +1656,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1551,8 +1694,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1568,9 +1712,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1581,27 +1726,30 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1612,26 +1760,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1642,27 +1794,30 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1677,11 +1832,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1695,9 +1851,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1708,28 +1865,31 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 -; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1740,27 +1900,31 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1771,28 +1935,31 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1810,11 +1977,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1830,9 +1998,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1843,30 +2012,33 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 4 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1877,28 +2049,32 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1909,30 +2085,33 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1949,8 +2128,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1963,9 +2143,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1976,24 +2157,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2004,25 +2188,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2033,24 +2221,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2065,8 +2256,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2079,9 +2271,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2092,24 +2285,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2120,25 +2316,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2149,24 +2349,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2181,8 +2384,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2196,9 +2400,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2209,25 +2414,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2238,25 +2446,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2267,25 +2479,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2300,8 +2515,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2316,9 +2532,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2329,26 +2546,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2359,26 +2579,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2389,26 +2613,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2423,8 +2650,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2441,9 +2669,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2454,28 +2683,31 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2486,27 +2718,31 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2517,28 +2753,31 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2553,8 +2792,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2568,9 +2808,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2581,25 +2822,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2610,25 +2854,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2639,25 +2887,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2672,8 +2923,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2689,9 +2941,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2702,27 +2955,30 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2733,26 +2989,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2763,27 +3023,30 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2798,8 +3061,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2817,9 +3081,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2830,16 +3095,17 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -2850,9 +3116,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2863,13 +3131,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 @@ -2881,9 +3151,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2894,16 +3166,17 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 @@ -2914,9 +3187,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2931,9 +3206,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2945,9 +3221,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2958,24 +3235,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2986,25 +3266,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3015,24 +3299,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3048,9 +3335,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3062,9 +3350,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3075,24 +3364,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3103,25 +3395,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3132,24 +3428,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3165,9 +3464,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3179,9 +3479,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3192,24 +3493,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3220,25 +3524,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3249,24 +3557,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3282,8 +3593,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3297,9 +3609,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3310,25 +3623,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3339,25 +3655,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3368,25 +3688,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3401,8 +3724,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3416,9 +3740,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3429,25 +3754,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3458,26 +3786,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3488,25 +3820,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3521,9 +3856,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3535,9 +3871,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3548,24 +3885,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3576,25 +3916,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3605,24 +3949,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3638,8 +3985,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3653,9 +4001,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3666,25 +4015,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3695,26 +4047,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3725,25 +4081,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3758,9 +4117,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3772,9 +4132,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3785,24 +4146,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3813,25 +4177,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3842,24 +4210,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3875,9 +4246,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3889,9 +4261,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3902,24 +4275,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3930,25 +4306,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3959,24 +4339,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3992,8 +4375,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -4007,9 +4391,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4020,25 +4405,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4049,25 +4437,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4078,25 +4470,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4111,8 +4506,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -4127,9 +4523,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4140,26 +4537,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4170,26 +4570,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_mov_b32_e32 v2, 5 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4200,26 +4604,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4234,8 +4641,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -4251,9 +4659,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4264,27 +4673,30 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4295,26 +4707,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4325,27 +4741,30 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4360,9 +4779,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -4374,9 +4794,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4387,24 +4808,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4415,25 +4839,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4444,24 +4872,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4477,8 +4908,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -4494,9 +4926,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4507,27 +4940,30 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4538,26 +4974,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4568,27 +5008,30 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4603,8 +5046,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -4621,9 +5065,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4634,28 +5079,31 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4666,27 +5114,31 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 5 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4697,28 +5149,31 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4733,10 +5188,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4751,9 +5207,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4764,29 +5221,32 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4797,30 +5257,34 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] ; GFX11-NEXT: global_load_b128 v[4:7], v4, s[0:1] offset:16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4831,29 +5295,32 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4870,8 +5337,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -4891,9 +5359,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4904,18 +5373,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-NEXT: v_mov_b32_e32 v7, 8 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -4926,9 +5396,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4939,14 +5411,16 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 ; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 @@ -4958,9 +5432,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4971,18 +5447,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 @@ -4993,9 +5470,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5010,10 +5489,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5030,9 +5510,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5043,11 +5524,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5056,18 +5539,19 @@ ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[34:35] offset:16 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[34:35] offset:32 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v16i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5078,11 +5562,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5091,19 +5578,20 @@ ; GFX11-NEXT: global_load_b128 v[4:7], v12, s[0:1] offset:16 ; GFX11-NEXT: global_load_b128 v[8:11], v12, s[0:1] offset:32 ; GFX11-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:48 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5114,11 +5602,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -5127,18 +5617,19 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5155,10 +5646,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5180,9 +5672,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5193,11 +5686,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5210,18 +5705,19 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5232,11 +5728,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5249,19 +5748,20 @@ ; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80 ; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96 ; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5272,11 +5772,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -5289,18 +5791,19 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5317,11 +5820,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: global_load_dword v32, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35] @@ -5345,9 +5849,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5358,11 +5863,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dword v33, v[0:1], off @@ -5376,20 +5883,21 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5400,11 +5908,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: global_load_b32 v32, v[0:1], off @@ -5418,20 +5929,21 @@ ; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80 ; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96 ; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5442,11 +5954,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off @@ -5460,20 +5974,21 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8) ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5491,10 +6006,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v43, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -5512,10 +6028,11 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: v_readlane_b32 s33, v43, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5526,21 +6043,22 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v43, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: global_store_dword v[41:42], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5549,10 +6067,12 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: v_readlane_b32 s33, v43, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5563,21 +6083,24 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v43, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 -; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5586,10 +6109,12 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: v_readlane_b32 s33, v43, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5600,21 +6125,22 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s32 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5623,10 +6149,12 @@ ; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v43, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s32 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5642,10 +6170,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5660,9 +6189,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5673,29 +6203,32 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35] ; GFX10-NEXT: global_load_dword v1, v2, s[34:35] offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5706,30 +6239,34 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v0, v1, s[0:1] ; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] offset:4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5740,29 +6277,32 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dword v1, v2, s[0:1] offset:4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5779,13 +6319,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 @@ -5796,10 +6337,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5810,17 +6352,18 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -5828,10 +6371,12 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5842,17 +6387,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, s33 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, s33 +; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -5861,10 +6408,12 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5875,17 +6424,18 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -5893,10 +6443,12 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5916,8 +6468,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 @@ -5938,7 +6491,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5946,6 +6499,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5956,23 +6510,24 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 +; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 @@ -5980,14 +6535,16 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5998,10 +6555,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -6021,14 +6580,16 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6039,10 +6600,11 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:16 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:20 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 @@ -6063,14 +6625,16 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: global_store_dword v[0:1], v1, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:20 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6098,10 +6662,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6134,9 +6699,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6147,14 +6713,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35] @@ -6185,9 +6752,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6198,14 +6767,16 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] @@ -6233,9 +6804,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6246,14 +6819,15 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] @@ -6284,9 +6858,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6304,7 +6880,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 32 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 @@ -6383,7 +6959,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 32 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -6682,8 +7258,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -6697,9 +7274,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6710,25 +7288,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6739,26 +7320,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6769,25 +7353,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6802,9 +7389,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -6818,9 +7406,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6831,26 +7420,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6861,17 +7453,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6879,9 +7473,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6892,26 +7488,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6926,9 +7525,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -6942,9 +7542,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6955,26 +7556,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6985,17 +7589,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7003,9 +7609,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7016,26 +7624,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7050,9 +7661,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -7066,9 +7678,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7079,26 +7692,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 42 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 42 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7109,17 +7725,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 42 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 42 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7127,9 +7745,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7140,26 +7760,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7174,10 +7797,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7193,9 +7817,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7206,18 +7831,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -7226,9 +7852,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7239,18 +7867,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7260,9 +7890,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7273,18 +7905,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7293,9 +7926,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7310,14 +7945,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -7333,9 +7969,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7346,13 +7983,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -7370,9 +8008,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7383,13 +8023,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -7408,9 +8050,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7421,13 +8065,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -7445,9 +8090,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7463,12 +8110,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -7488,9 +8136,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7501,18 +8150,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -7527,9 +8177,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7540,18 +8192,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -7567,9 +8221,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7580,18 +8236,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -7606,9 +8263,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7623,8 +8282,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -7633,6 +8292,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 @@ -7652,9 +8312,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 8 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7665,13 +8326,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -7695,9 +8357,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7708,13 +8372,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 8 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -7739,9 +8405,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 8 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7752,13 +8420,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -7782,9 +8451,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7802,8 +8473,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -7814,6 +8485,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 @@ -7837,9 +8509,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7850,13 +8523,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -7886,9 +8560,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7899,13 +8575,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 10 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -7936,9 +8614,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7949,13 +8629,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -7985,9 +8666,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8004,9 +8687,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -8020,9 +8704,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8033,26 +8718,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8063,17 +8751,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x4400 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x4400 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8081,9 +8771,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8094,26 +8786,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8128,9 +8823,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -8144,9 +8840,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8157,26 +8854,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 4.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 4.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8187,17 +8887,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 4.0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 4.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8205,9 +8907,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8218,26 +8922,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8252,10 +8959,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -8271,9 +8979,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8284,18 +8993,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -8304,9 +9014,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8317,18 +9029,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8338,9 +9052,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8351,18 +9067,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8371,9 +9088,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8388,11 +9107,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 5 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 @@ -8410,9 +9130,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 5 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8423,18 +9144,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 @@ -8446,9 +9168,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8459,18 +9183,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 5 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 4.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 3 @@ -8483,9 +9209,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 5 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8496,18 +9224,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 @@ -8519,9 +9248,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8536,13 +9267,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 @@ -8564,9 +9296,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 7 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8577,18 +9310,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 7 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[34:35] -; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_getpc_b64 s[34:35] +; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8606,9 +9340,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8619,18 +9355,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 7 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 4.0 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -8649,9 +9387,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 7 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8662,18 +9402,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8691,9 +9432,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8708,10 +9451,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -8727,9 +9471,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8740,18 +9485,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -8760,9 +9506,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8773,18 +9521,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40100000 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0x40100000 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8794,9 +9544,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8807,18 +9559,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8827,9 +9580,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8844,12 +9599,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -8869,9 +9625,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8882,18 +9639,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8908,9 +9666,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8921,18 +9681,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -8948,9 +9710,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8961,18 +9725,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8987,9 +9752,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9004,14 +9771,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 @@ -9035,9 +9803,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 8 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9048,18 +9817,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9080,9 +9850,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9093,18 +9865,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 8 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -9126,9 +9900,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 8 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9139,18 +9915,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9171,9 +9948,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9188,10 +9967,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -9204,9 +9984,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9217,26 +9998,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9247,17 +10031,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9265,9 +10051,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9278,26 +10066,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9313,11 +10104,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -9331,9 +10123,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9344,12 +10137,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -9363,9 +10157,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9376,12 +10172,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -9396,9 +10194,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9409,12 +10209,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -9428,9 +10229,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9446,11 +10249,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -9464,9 +10268,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9477,12 +10282,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -9496,9 +10302,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9509,12 +10317,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -9529,9 +10339,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9542,12 +10354,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -9561,9 +10374,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9579,10 +10394,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -9598,9 +10414,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9611,18 +10428,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 3 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 3 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -9631,9 +10449,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9644,18 +10464,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 3 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x20001 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 3 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9665,9 +10487,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9678,18 +10502,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9698,9 +10523,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9715,10 +10542,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -9734,9 +10562,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9747,18 +10576,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -9767,9 +10597,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9780,18 +10612,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_movk_i32 s5, 0x4400 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_movk_i32 s5, 0x4400 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9801,9 +10635,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9814,18 +10650,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9834,9 +10671,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9851,11 +10690,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -9869,9 +10709,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9882,12 +10723,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -9901,9 +10743,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9914,12 +10758,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -9934,9 +10780,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9947,12 +10795,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -9966,9 +10815,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9984,10 +10835,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -10003,9 +10855,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10016,18 +10869,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -10036,9 +10890,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10049,18 +10905,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40003 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x20001 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0x40003 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10070,9 +10928,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10083,18 +10943,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10103,9 +10964,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -10120,10 +10983,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -10136,9 +11000,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10149,26 +11014,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10179,17 +11047,19 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10197,9 +11067,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10210,26 +11082,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -10245,11 +11120,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -10263,9 +11139,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10276,12 +11153,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -10295,9 +11173,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10308,12 +11188,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -10328,9 +11210,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10341,12 +11225,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -10360,9 +11245,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -10378,10 +11265,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -10397,9 +11285,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10410,18 +11299,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -10430,9 +11320,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10443,18 +11335,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10464,9 +11358,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10477,18 +11373,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10497,9 +11394,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -10514,11 +11413,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 5 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 @@ -10536,9 +11436,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 5 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10549,18 +11450,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 5 ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 @@ -10572,9 +11474,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10585,18 +11489,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 5 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 3 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 5 ; GFX11-NEXT: v_writelane_b32 v40, s30, 3 @@ -10609,9 +11515,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 5 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10622,18 +11530,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 @@ -10645,9 +11554,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -10662,12 +11573,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -10687,9 +11599,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10700,18 +11613,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 5 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -10726,9 +11640,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10739,18 +11655,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 3 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 5 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -10766,9 +11684,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10779,18 +11699,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -10805,9 +11726,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -10822,13 +11745,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -10844,9 +11768,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10857,12 +11782,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -10880,9 +11806,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10893,12 +11821,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -10917,9 +11847,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10930,12 +11862,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -10953,9 +11886,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -10971,12 +11906,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -10996,9 +11932,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11009,18 +11946,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11035,9 +11973,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11048,18 +11988,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -11075,9 +12017,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11088,18 +12032,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -11114,9 +12059,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -11131,13 +12078,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 @@ -11159,9 +12107,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 7 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11172,18 +12121,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 7 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11201,9 +12151,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11214,18 +12166,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 7 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -11244,9 +12198,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 7 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11257,18 +12213,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -11286,9 +12243,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -11303,8 +12262,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11316,6 +12275,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 @@ -11335,9 +12295,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11348,13 +12309,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11381,9 +12343,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11394,13 +12358,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 10 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -11428,9 +12394,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11441,13 +12409,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -11474,9 +12443,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -11493,8 +12464,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11503,6 +12474,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 @@ -11530,9 +12502,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11543,18 +12516,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11581,9 +12555,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11594,18 +12570,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 10 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -11633,9 +12611,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11646,18 +12626,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -11684,9 +12665,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -11701,8 +12684,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 18 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11722,6 +12705,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 16 @@ -11749,9 +12733,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 18 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11762,13 +12747,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11811,9 +12797,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 18 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11824,13 +12812,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 18 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -11874,9 +12864,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 18 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11887,13 +12879,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -11936,9 +12929,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 18 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -11955,8 +12950,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 28 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11985,6 +12980,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 ; GFX9-NEXT: v_writelane_b32 v40, s27, 23 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s28, 24 @@ -12047,9 +13043,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 28 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12060,13 +13057,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -12154,9 +13152,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12167,13 +13167,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 28 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -12256,9 +13258,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 28 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12269,13 +13273,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -12359,9 +13364,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -12378,8 +13385,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 28 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12409,6 +13416,7 @@ ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s24, 20 ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 @@ -12475,9 +13483,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 28 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12488,13 +13497,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -12587,9 +13597,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12600,13 +13612,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 28 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -12692,9 +13706,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 28 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12705,13 +13721,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -12800,9 +13817,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -12820,12 +13839,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -12838,10 +13858,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12852,15 +13873,16 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12 @@ -12872,10 +13894,12 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12886,26 +13910,30 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 +; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12916,26 +13944,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -12951,8 +13982,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 12 @@ -13004,9 +14036,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13017,12 +14050,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-NEXT: v_mov_b32_e32 v2, 14 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 15 @@ -13071,9 +14105,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13084,18 +14120,20 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 ; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1 +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 ; GFX11-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, 2 ; GFX11-NEXT: v_dual_mov_b32 v8, 2 :: v_dual_mov_b32 v9, 3 ; GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_mov_b32 v11, 3 @@ -13118,9 +14156,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13131,22 +14171,23 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 2 @@ -13182,9 +14223,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -13212,8 +14255,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 @@ -13273,9 +14317,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13286,12 +14331,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 10 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -13348,9 +14394,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13361,13 +14409,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 ; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 ; GFX11-NEXT: v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9 ; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 @@ -13399,9 +14449,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13412,9 +14464,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 @@ -13423,6 +14475,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 @@ -13468,9 +14521,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -13494,8 +14549,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000 @@ -13555,9 +14611,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13568,12 +14625,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -13630,9 +14688,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13643,9 +14703,10 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x41400000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x41500000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41600000 @@ -13654,6 +14715,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v5, 0x41100000 ; GFX11-NEXT: v_mov_b32_e32 v6, 0x41200000 ; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 @@ -13687,9 +14749,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13700,9 +14764,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41400000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41500000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41600000 @@ -13711,6 +14775,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41100000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 @@ -13756,9 +14821,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -11,10 +11,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -31,9 +32,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -44,12 +46,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -65,9 +68,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -78,12 +83,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_getpc_b64 s[4:5] ; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -100,9 +107,11 @@ ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -213,9 +222,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -236,9 +246,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -249,15 +260,16 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: ;;#ASMSTART @@ -273,9 +285,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -286,15 +300,17 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: ;;#ASMSTART @@ -310,9 +326,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -328,8 +346,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v42, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -351,9 +370,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -364,21 +384,22 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, v31 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_mov_b32_e32 v31, v41 ; GFX10-NEXT: ;;#ASMSTART @@ -388,9 +409,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -401,21 +424,24 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, v31 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v31, v41 ; GFX11-NEXT: ;;#ASMSTART @@ -425,9 +451,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -444,9 +472,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -467,9 +496,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -480,20 +510,21 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_mov_b32 s33, s4 @@ -504,9 +535,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -517,20 +550,22 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s33 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, s33 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s33, s4 @@ -542,9 +577,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -560,9 +597,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -583,9 +621,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -596,20 +635,21 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[36:37] ; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_mov_b32 s34, s4 @@ -620,9 +660,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -633,20 +675,22 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s34 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, s34 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s34, s4 @@ -658,9 +702,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -676,8 +722,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 2 +; GFX9-NEXT: v_writelane_b32 v42, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v41, s30, 0 @@ -697,9 +744,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 2 +; GFX9-NEXT: v_readlane_b32 s33, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -710,20 +758,21 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v41, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v40 @@ -732,9 +781,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 2 +; GFX10-NEXT: v_readlane_b32 s33, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -745,20 +796,23 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 2 +; GFX11-NEXT: v_writelane_b32 v41, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v41, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v41, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v40 @@ -767,9 +821,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 2 +; GFX11-NEXT: v_readlane_b32 s33, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -909,8 +965,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -922,9 +979,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -935,23 +993,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s33@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -962,24 +1023,28 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s33@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s33@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -993,8 +1058,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1006,9 +1072,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1019,23 +1086,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s34@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1046,24 +1116,28 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s34@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s34@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1077,9 +1151,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -1099,9 +1174,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1112,20 +1188,21 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART @@ -1135,9 +1212,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1148,20 +1227,22 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, s40 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART @@ -1172,9 +1253,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1190,9 +1273,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v42, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -1221,9 +1305,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1234,26 +1319,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v42, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_mov_b32_e32 v41, v32 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s4 @@ -1266,9 +1352,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1279,26 +1367,29 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v42, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v32 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: v_mov_b32_e32 v41, v32 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s4 @@ -1311,9 +1402,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: v_readlane_b32 s33, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -27,7 +27,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v1, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -41,7 +41,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v1, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -56,21 +56,21 @@ ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v1, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v1, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -85,14 +85,14 @@ ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v1, s33, 2 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_i1@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_i1@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -100,7 +100,7 @@ ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v1, 2 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 @@ -135,7 +135,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v1, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -149,7 +149,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v1, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -164,21 +164,21 @@ ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v1, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v1, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -193,14 +193,14 @@ ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v1, s33, 2 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_i16@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_i16@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -208,7 +208,7 @@ ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v1, 2 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 @@ -243,7 +243,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v1, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -257,7 +257,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v1, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -272,21 +272,21 @@ ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v1, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v1, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -301,14 +301,14 @@ ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v1, s33, 2 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_2xi16@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_2xi16@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -316,7 +316,7 @@ ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v1, 2 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 @@ -360,7 +360,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v2, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -374,7 +374,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v2, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -389,21 +389,21 @@ ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v2, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v2, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -418,14 +418,14 @@ ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v2, s33, 2 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_3xi16@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_3xi16@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v2, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v2, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -433,7 +433,7 @@ ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v2, 2 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 @@ -1641,7 +1641,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v2, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x1ffc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffe0000 ; GFX9-NEXT: s_add_i32 s32, s32, 0x60000 @@ -1657,7 +1657,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000 -; GFX9-NEXT: v_readlane_b32 s33, v2, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -1672,7 +1672,7 @@ ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v2, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0 ; GFX10-NEXT: s_add_i32 s32, s32, 0x30000 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000 @@ -1688,7 +1688,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000 -; GFX10-NEXT: v_readlane_b32 s33, v2, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -1703,7 +1703,7 @@ ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v5, s32 offset:2048 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v5, s33, 2 +; GFX11-NEXT: s_mov_b32 s34, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x7ff ; GFX11-NEXT: s_addk_i32 s32, 0x1800 ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffff800 @@ -1720,7 +1720,7 @@ ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0 ; GFX11-NEXT: s_addk_i32 s32, 0xe800 -; GFX11-NEXT: v_readlane_b32 s33, v5, 2 +; GFX11-NEXT: s_mov_b32 s33, s34 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:2048 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -395,8 +395,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 18 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -465,9 +466,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 18 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -477,8 +479,9 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v40, s33, 18 +; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -547,9 +550,10 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 18 +; GISEL-NEXT: v_readlane_b32 s33, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -563,8 +567,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 18 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -636,9 +641,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 18 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -648,8 +654,9 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v40, s33, 18 +; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -719,9 +726,10 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 18 +; GISEL-NEXT: v_readlane_b32 s33, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -735,8 +743,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 18 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -807,9 +816,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 18 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -819,8 +829,9 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v40, s33, 18 +; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -891,9 +902,10 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 18 +; GISEL-NEXT: v_readlane_b32 s33, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -908,8 +920,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 20 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -989,9 +1002,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 20 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1001,8 +1015,9 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v40, s33, 20 +; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -1082,9 +1097,10 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 20 +; GISEL-NEXT: v_readlane_b32 s33, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1106,7 +1122,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 32 +; GCN-NEXT: s_mov_b32 s5, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -1187,7 +1203,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 32 +; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1200,7 +1216,7 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_writelane_b32 v40, s33, 32 +; GISEL-NEXT: s_mov_b32 s5, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -1281,7 +1297,7 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 32 +; GISEL-NEXT: s_mov_b32 s33, s5 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] @@ -1298,7 +1314,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 32 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1383,7 +1399,7 @@ ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 32 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1396,7 +1412,7 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_writelane_b32 v40, s33, 32 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1481,7 +1497,7 @@ ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 32 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] @@ -1502,7 +1518,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 32 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -1585,7 +1601,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 32 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1598,7 +1614,7 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_writelane_b32 v40, s33, 32 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -1681,7 +1697,7 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 32 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] @@ -1699,7 +1715,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 32 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -1779,7 +1795,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 32 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1792,7 +1808,7 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_writelane_b32 v40, s33, 32 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -1872,7 +1888,7 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 32 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -190,10 +190,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 5 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v44, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 @@ -229,9 +230,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 5 +; GFX9-NEXT: v_readlane_b32 s33, v44, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -30,7 +30,7 @@ ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v1, s33, 2 +; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -51,7 +51,7 @@ ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v1, 2 +; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] @@ -155,7 +155,7 @@ ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v1, s33, 2 +; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v1, s30, 0 @@ -171,7 +171,7 @@ ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v1, 2 +; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] @@ -189,7 +189,7 @@ ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v2, s33, 2 +; CHECK-NEXT: s_mov_b32 s7, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v2, s30, 0 @@ -205,7 +205,7 @@ ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v2, 2 +; CHECK-NEXT: s_mov_b32 s33, s7 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -13,8 +13,9 @@ ; Spill CSR VGPR used for SGPR spilling ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v40, s33, 2 +; GCN-DAG: v_writelane_b32 v41, s33, 0 ; GCN-DAG: s_mov_b32 s33, s32 ; GCN-DAG: s_addk_i32 s32, 0x400 ; GCN-DAG: v_writelane_b32 v40, s30, 0 @@ -26,9 +27,10 @@ ; GCN: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -15,8 +15,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: v_writelane_b32 v40, s33, 2 +; CHECK-NEXT: v_writelane_b32 v41, s33, 0 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 @@ -39,9 +40,10 @@ ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v40, 2 +; CHECK-NEXT: v_readlane_b32 s33, v41, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -24,7 +24,7 @@ liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX8-LABEL: name: pei_scavenge_vgpr_spill - ; GFX8: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr2 + ; GFX8: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc @@ -49,7 +49,7 @@ ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX8-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; GFX9-LABEL: name: pei_scavenge_vgpr_spill - ; GFX9: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr2 + ; GFX9: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc @@ -72,7 +72,7 @@ ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX9-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill - ; GFX9-FLATSCR: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr2 + ; GFX9-FLATSCR: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/save-fp.ll b/llvm/test/CodeGen/AMDGPU/save-fp.ll --- a/llvm/test/CodeGen/AMDGPU/save-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/save-fp.ll @@ -11,14 +11,14 @@ ; GCN-LABEL: {{^}}caller: -; GCN: v_writelane_b32 v2, s33, 2 +; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 ; GFX900: buffer_store_dword ; GFX908-DAG: v_accvgpr_write_b32 ; GCN: s_swappc_b64 ; GFX900: buffer_load_dword ; GFX908: v_accvgpr_read_b32 -; GCN: v_readlane_b32 s33, v2, 2 +; GCN: s_mov_b32 s33, [[TMP_SGPR]] define i64 @caller() { bb: call void asm sideeffect "", "~{v40}" () diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -19,7 +19,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v255, s33, 2 +; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill @@ -264,7 +264,7 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: v_readlane_b32 s33, v255, 2 +; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -311,7 +311,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v254, s33, 2 +; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill @@ -554,7 +554,7 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: v_readlane_b32 s33, v254, 2 +; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -200,9 +200,10 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 ; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[CSRV_1:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 [[CSRV]], s33, 2 -; GCN-DAG: s_addk_i32 s32, 0x400 +; GCN: v_writelane_b32 [[CSRV_1]], s33, 0 +; GCN-DAG: s_addk_i32 s32, 0x800 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -226,10 +227,11 @@ ; GCN-DAG: v_readlane_b32 s30, [[CSRV]], 0 ; GCN-DAG: v_readlane_b32 s31, [[CSRV]], 1 -; GCN: s_addk_i32 s32, 0xfc00 +; GCN: s_addk_i32 s32, 0xf800 ; GCN-NEXT: v_readlane_b32 s33, ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[CSRV_1]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -3,16 +3,18 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v40, s33, 3 +; GCN: v_writelane_b32 v41, s33, 0 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 s33, v40, 3 +; GCN: v_readlane_b32 s33, v41, 0 ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_setpc_b64 define void @spill_csr_s5_copy() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -157,12 +157,13 @@ ; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: ; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[VGPR_REG_1:v[0-9]+]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2 +; GCN-NEXT: v_writelane_b32 [[VGPR_REG_1]], s33, 0 ; GCN-DAG: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 ; GCN: v_mov_b32_e32 v32, 0 -; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 +; GCN-DAG: v_writelane_b32 [[VGPR_REG_1]], s34, 1 ; GCN: s_mov_b32 s34, s32 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -174,10 +175,11 @@ ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 ; GCN: s_add_i32 s32, s32, 0xfffd0000 -; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2 -; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 +; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG_1]], 0 +; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG_1]], 1 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[VGPR_REG_1]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN: s_setpc_b64 s[30:31] %temp = alloca i32, align 1024, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll --- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll @@ -20,8 +20,8 @@ ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_writelane_b32 v1, s33, 3 ; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: s_mov_b32 s36, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v1, s30, 1 @@ -36,7 +36,7 @@ ; GCN-NEXT: v_readlane_b32 s30, v1, 1 ; GCN-NEXT: v_readlane_b32 s4, v1, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v1, 3 +; GCN-NEXT: s_mov_b32 s33, s36 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -10,8 +10,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, s[16:17] -; GFX90A-NEXT: v_writelane_b32 v40, s33, 2 +; GFX90A-NEXT: v_writelane_b32 v41, s33, 0 ; GFX90A-NEXT: s_mov_b32 s33, s32 ; GFX90A-NEXT: s_addk_i32 s32, 0x400 ; GFX90A-NEXT: s_getpc_b64 s[16:17] @@ -33,12 +34,12 @@ ; GLOBALNESS1-NEXT: s_mov_b64 s[54:55], s[6:7] ; GLOBALNESS1-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, v0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v44, 0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dword v[0:1], v44, off +; GLOBALNESS1-NEXT: global_store_dword v[0:1], v40, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v0, v44, s[36:37] +; GLOBALNESS1-NEXT: global_load_dword v0, v40, s[36:37] ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS1-NEXT: s_mov_b64 s[64:65], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 @@ -46,10 +47,10 @@ ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, 0x40994400 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s38, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[44:45] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[94:95], s[4:5], -1 @@ -76,20 +77,20 @@ ; GLOBALNESS1-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s4, 0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s5, 1 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 0 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 1 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s4, 2 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s5, 3 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 2 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 3 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s4, 4 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 4 ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s5, 5 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 5 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v41, 4 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v41, 5 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 4 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 5 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow6 @@ -137,10 +138,10 @@ ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_15 Depth 2 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] -; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1] +; GLOBALNESS1-NEXT: flat_load_dword v44, v[0:1] ; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 -; GLOBALNESS1-NEXT: buffer_store_dword v44, off, s[0:3], 0 -; GLOBALNESS1-NEXT: flat_load_dword v43, v[0:1] +; GLOBALNESS1-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GLOBALNESS1-NEXT: flat_load_dword v45, v[0:1] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] @@ -148,7 +149,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] @@ -218,19 +219,19 @@ ; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[32:33], off -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v41, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v41, 1 +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v42, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v42, 1 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 +; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc ; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[42:43] ; GLOBALNESS1-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) @@ -279,9 +280,9 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] @@ -289,15 +290,15 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], a[32:33], off +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], a[32:33], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_13 ; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -359,21 +360,21 @@ ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v41, 2 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v41, 3 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 2 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 3 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -388,7 +389,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -406,7 +407,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -418,12 +419,12 @@ ; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[6:7] ; GLOBALNESS0-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, v0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v44, 0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v40, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dword v[0:1], v44, off +; GLOBALNESS0-NEXT: global_store_dword v[0:1], v40, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v0, v44, s[36:37] +; GLOBALNESS0-NEXT: global_load_dword v0, v40, s[36:37] ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS0-NEXT: s_mov_b64 s[62:63], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 @@ -431,10 +432,10 @@ ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, 0x40994400 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s38, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[44:45] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[94:95], s[4:5], -1 @@ -461,20 +462,20 @@ ; GLOBALNESS0-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 1 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 1 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 2 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 3 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 2 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 3 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 4 ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 5 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 5 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 5 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 5 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6 @@ -522,10 +523,10 @@ ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_15 Depth 2 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] -; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1] +; GLOBALNESS0-NEXT: flat_load_dword v44, v[0:1] ; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40 -; GLOBALNESS0-NEXT: buffer_store_dword v44, off, s[0:3], 0 -; GLOBALNESS0-NEXT: flat_load_dword v43, v[0:1] +; GLOBALNESS0-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GLOBALNESS0-NEXT: flat_load_dword v45, v[0:1] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] @@ -533,7 +534,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] @@ -603,19 +604,19 @@ ; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[32:33], off -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 1 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 1 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 +; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc ; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[42:43] ; GLOBALNESS0-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) @@ -664,9 +665,9 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] @@ -674,15 +675,15 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], a[32:33], off +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], a[32:33], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_13 ; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -744,21 +745,21 @@ ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 2 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 3 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 3 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -773,7 +774,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -791,7 +792,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -8,8 +8,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 16 +; GCN-NEXT: v_writelane_b32 v42, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill @@ -119,9 +120,10 @@ ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 16 +; GCN-NEXT: v_readlane_b32 s33, v42, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -263,8 +265,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 18 +; GCN-NEXT: v_writelane_b32 v46, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x800 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -13,8 +13,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v45, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v36, v16 ; GFX9-NEXT: v_mov_b32_e32 v35, v15 @@ -54,9 +55,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v45, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -67,6 +69,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v36, v16 @@ -74,7 +77,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v34, v14 ; GFX10-NEXT: v_mov_b32_e32 v33, v13 ; GFX10-NEXT: v_mov_b32_e32 v32, v12 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v45, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill @@ -111,9 +114,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v45, 0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -124,12 +129,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 ; GFX11-NEXT: v_mov_b32_e32 v32, v12 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v45, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 @@ -164,9 +171,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v45, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -200,8 +209,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v46, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill @@ -235,9 +245,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v46, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -248,9 +259,10 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v46, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill @@ -285,9 +297,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v46, 0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -298,9 +312,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:24 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_writelane_b32 v46, s33, 0 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 @@ -333,9 +349,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: v_readlane_b32 s33, v46, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:24 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1112,11 +1112,12 @@ ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]+]], -1{{$}} ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN: s_mov_b32 s33, s32 ; GFX1064: s_addk_i32 s32, 0x400 ; GFX1032: s_addk_i32 s32, 0x200 @@ -1131,10 +1132,12 @@ ; GFX1064: s_addk_i32 s32, 0xfc00 ; GFX1032: s_addk_i32 s32, 0xfe00 -; GCN: v_readlane_b32 s33, v40, 2 +; GCN: v_readlane_b32 s33, v41, 0 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -337,7 +337,7 @@ ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 2 +; GFX9-O0-NEXT: s_mov_b32 s35, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 @@ -371,7 +371,7 @@ ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-O0-NEXT: s_mov_b32 s33, s35 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -388,8 +388,8 @@ ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v3, s33, 2 ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 @@ -411,7 +411,7 @@ ; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-O3-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -532,7 +532,7 @@ ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_writelane_b32 v10, s33, 8 +; GFX9-O0-NEXT: s_mov_b32 s42, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 @@ -599,7 +599,7 @@ ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 -; GFX9-O0-NEXT: v_readlane_b32 s33, v10, 8 +; GFX9-O0-NEXT: s_mov_b32 s33, s42 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 @@ -639,8 +639,8 @@ ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v8, s33, 2 ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 +; GFX9-O3-NEXT: s_mov_b32 s40, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 @@ -672,7 +672,7 @@ ; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-O3-NEXT: v_readlane_b32 s33, v8, 2 +; GFX9-O3-NEXT: s_mov_b32 s33, s40 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0