diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -69,25 +69,7 @@ // We need to save and restore the current FP/BP. - // 1: If there is already a VGPR with free lanes, use it. We - // may already have to pay the penalty for spilling a CSR VGPR. - if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { - int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, - TargetStackID::SGPRSpill); - - if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) - llvm_unreachable("allocate SGPR spill should have worked"); - - FrameIndex = NewFI; - - LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); - dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to " - << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane - << '\n'); - return; - } - - // 2: Next, try to save the FP/BP in an unused SGPR. + // 1: Try to save the FP/BP in an unused SGPR. TempSGPR = findScratchNonCalleeSaveRegister( MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); @@ -95,21 +77,20 @@ int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, TargetStackID::SGPRSpill); - if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { - // 3: There's no free lane to spill, and no free register to save FP/BP, + if (TRI->spillSGPRToVGPR() && + MFI->allocateSGPRSpillToVGPR(MF, NewFI, /* IsPEI */ true)) { + // 2: There's no free lane to spill, and no free register to save FP/BP, // so we're forced to spill another VGPR to use for the spill. - auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); - MFI->addToWWMVGPRs(MF, Spill.VGPR); - FrameIndex = NewFI; LLVM_DEBUG( + auto Spill = MFI->getSGPRToVGPRCustomSpills(NewFI).front(); dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to " << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); } else { // Remove dead index MF.getFrameInfo().RemoveStackObject(NewFI); - // 4: If all else fails, spill the FP/BP to memory. + // 3: If all else fails, spill the FP/BP to memory. FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4)); LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling " << (IsFP ? "FP" : "BP") << '\n'); @@ -823,7 +804,7 @@ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getSGPRToVGPRSpills(FI); + FuncInfo->getSGPRToVGPRCustomSpills(FI); assert(Spill.size() == 1); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) @@ -1012,7 +993,7 @@ auto RestoreSGPRFromVGPRLane = [&](Register Reg, const int FI) { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getSGPRToVGPRSpills(FI); + FuncInfo->getSGPRToVGPRCustomSpills(FI); assert(Spill.size() == 1); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), Reg) .addReg(Spill[0].VGPR) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -462,7 +462,9 @@ // Track VGPR + wave index for each subregister of the SGPR spilled to // frameindex key. DenseMap> SGPRToVGPRSpills; + DenseMap> SGPRToVGPRCustomSpills; unsigned NumVGPRSpillLanes = 0; + unsigned NumVGPRCustomSpillLanes = 0; SmallVector SpillVGPRs; using WWMVGPRsMap = MapVector>; // To track the registers used in instructions that can potentially modify the @@ -541,6 +543,14 @@ const WWMVGPRsMap &getWWMVGPRs() const { return WWMVGPRs; } const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; } + ArrayRef + getSGPRToVGPRCustomSpills(int FrameIndex) const { + auto I = SGPRToVGPRCustomSpills.find(FrameIndex); + return (I == SGPRToVGPRCustomSpills.end()) + ? ArrayRef() + : makeArrayRef(I->second); + } + void addToWWMVGPRs(MachineFunction &MF, Register VGPR, uint64_t Size = 4, Align Alignment = Align(4)); @@ -564,9 +574,11 @@ I->second.IsDead = true; } - bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, - unsigned NumLane) const; - bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); + bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); + bool allocateVGPRForCustomSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); + bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI, bool IsPEI = false); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); /// If \p ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -284,21 +284,78 @@ return false; } -/// \p returns true if \p NumLanes slots are available in VGPRs already used for -/// SGPR spilling. -// -// FIXME: This only works after processFunctionBeforeFrameFinalized -bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF, - unsigned NumNeed) const { +bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, + int FI, + unsigned LaneIndex) { const GCNSubtarget &ST = MF.getSubtarget(); - unsigned WaveSize = ST.getWavefrontSize(); - return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register LaneVGPR; + if (!LaneIndex) { + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (LaneVGPR == AMDGPU::NoRegister) { + // We have no VGPRs left for spilling SGPRs. Reset because we will not + // partially spill the SGPR to VGPRs. + SGPRToVGPRSpills.erase(FI); + + // FIXME: We can run out of free registers with split allocation if + // IPRA is enabled and a called function already uses every VGPR. +#if 0 + DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), + "VGPRs for SGPR spilling", + 0, DS_Error); + MF.getFunction().getContext().diagnose(DiagOutOfRegs); +#endif + return false; + } + + SpillVGPRs.push_back(LaneVGPR); + // Add this register as live-in to all blocks to avoid machine verifier + // complaining about use of an undefined physical register. + for (MachineBasicBlock &BB : MF) + BB.addLiveIn(LaneVGPR); + } else { + LaneVGPR = SpillVGPRs.back(); + } + + SGPRToVGPRSpills[FI].push_back( + SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); + return true; +} + +bool SIMachineFunctionInfo::allocateVGPRForCustomSGPRSpills( + MachineFunction &MF, int FI, unsigned LaneIndex) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register LaneVGPR; + if (!LaneIndex) { + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (LaneVGPR == AMDGPU::NoRegister) { + // We have no VGPRs left for spilling SGPRs. Reset because we will not + // partially spill the SGPR to VGPRs. + SGPRToVGPRCustomSpills.erase(FI); + return false; + } + + addToWWMVGPRs(MF, LaneVGPR); + for (MachineBasicBlock &MBB : MF) { + MBB.addLiveIn(LaneVGPR); + MBB.sortUniqueLiveIns(); + } + } else { + LaneVGPR = WWMVGPRs.back().first; + } + + SGPRToVGPRCustomSpills[FI].push_back( + SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); + return true; } -/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. -bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, - int FI) { - std::vector &SpillLanes = SGPRToVGPRSpills[FI]; +bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, int FI, + bool IsPEI) { + std::vector &SpillLanes = + IsPEI ? SGPRToVGPRCustomSpills[FI] : SGPRToVGPRSpills[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -307,7 +364,6 @@ const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned WaveSize = ST.getWavefrontSize(); unsigned Size = FrameInfo.getObjectSize(FI); @@ -319,42 +375,17 @@ assert(Size >= 4 && "invalid sgpr spill size"); assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs"); - // Make sure to handle the case where a wide SGPR spill may span between two - // VGPRs. - for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { - Register LaneVGPR; - unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize); - - if (VGPRIndex == 0) { - LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); - if (LaneVGPR == AMDGPU::NoRegister) { - // We have no VGPRs left for spilling SGPRs. Reset because we will not - // partially spill the SGPR to VGPRs. - SGPRToVGPRSpills.erase(FI); - NumVGPRSpillLanes -= I; - - // FIXME: We can run out of free registers with split allocation if - // IPRA is enabled and a called function already uses every VGPR. -#if 0 - DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), - "VGPRs for SGPR spilling", - 0, DS_Error); - MF.getFunction().getContext().diagnose(DiagOutOfRegs); -#endif - return false; - } + unsigned &NumSpillLanes = IsPEI ? NumVGPRCustomSpillLanes : NumVGPRSpillLanes; - SpillVGPRs.push_back(LaneVGPR); + for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) { + unsigned LaneIndex = (NumSpillLanes % WaveSize); - // Add this register as live-in to all blocks to avoid machine verifier - // complaining about use of an undefined physical register. - for (MachineBasicBlock &BB : MF) - BB.addLiveIn(LaneVGPR); - } else { - LaneVGPR = SpillVGPRs.back(); + bool Allocated = IsPEI ? allocateVGPRForCustomSGPRSpills(MF, FI, LaneIndex) + : allocateVGPRForSGPRSpills(MF, FI, LaneIndex); + if (!Allocated) { + NumSpillLanes -= I; + return false; } - - SpillLanes.push_back(SIRegisterInfo::SpilledReg(LaneVGPR, VGPRIndex)); } return true; @@ -430,16 +461,14 @@ bool SIMachineFunctionInfo::removeDeadFrameIndices( MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { - // Remove dead frame indices from function frame, however keep FP & BP since - // spills for them haven't been inserted yet. And also make sure to remove the - // frame indices from `SGPRToVGPRSpills` data structure, otherwise, it could - // result in an unexpected side effect and bug, in case of any re-mapping of - // freed frame indices by later pass(es) like "stack slot coloring". + // Remove dead frame indices from function frame. And also make sure to remove + // the frame indices from `SGPRToVGPRSpills` data structure, otherwise, it + // could result in an unexpected side effect and bug, in case of any + // re-mapping of freed frame indices by later pass(es) like "stack slot + // coloring". for (auto &R : make_early_inc_range(SGPRToVGPRSpills)) { - if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) { - MFI.RemoveStackObject(R.first); - SGPRToVGPRSpills.erase(R.first); - } + MFI.RemoveStackObject(R.first); + SGPRToVGPRSpills.erase(R.first); } bool HaveSGPRToMemory = false; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -9,8 +9,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: v_writelane_b32 v40, s33, 2 +; CHECK-NEXT: v_writelane_b32 v41, s33, 0 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 @@ -27,9 +28,10 @@ ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: v_readlane_b32 s33, v40, 2 +; CHECK-NEXT: v_readlane_b32 s33, v41, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -238,8 +238,9 @@ ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: v_writelane_b32 v40, s33, 2 +; MUBUF-NEXT: v_writelane_b32 v41, s33, 0 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 @@ -259,9 +260,10 @@ ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 -; MUBUF-NEXT: v_readlane_b32 s33, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s33, v41, 0 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -271,8 +273,9 @@ ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: v_writelane_b32 v40, s33, 2 +; FLATSCR-NEXT: v_writelane_b32 v41, s33, 0 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 @@ -292,9 +295,10 @@ ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 -; FLATSCR-NEXT: v_readlane_b32 s33, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s33, v41, 0 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword v41, off, s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -308,10 +312,11 @@ ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: v_writelane_b32 v40, s33, 2 +; MUBUF-NEXT: v_writelane_b32 v41, s33, 0 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 @@ -376,9 +381,10 @@ ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 -; MUBUF-NEXT: v_readlane_b32 s33, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s33, v41, 0 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -388,9 +394,10 @@ ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off -; FLATSCR-NEXT: v_writelane_b32 v40, s33, 2 +; FLATSCR-NEXT: v_writelane_b32 v41, s33, 0 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 @@ -425,9 +432,10 @@ ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 -; FLATSCR-NEXT: v_readlane_b32 s33, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s33, v41, 0 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword v41, off, s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -232,13 +232,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 @@ -246,9 +247,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -19,8 +19,9 @@ ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FIXEDABI-NEXT: s_or_saveexec_b64 s[16:17], -1 ; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FIXEDABI-NEXT: s_mov_b64 exec, s[16:17] -; FIXEDABI-NEXT: v_writelane_b32 v40, s33, 2 +; FIXEDABI-NEXT: v_writelane_b32 v41, s33, 0 ; FIXEDABI-NEXT: s_mov_b32 s33, s32 ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 ; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0 @@ -32,9 +33,10 @@ ; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0 ; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00 -; FIXEDABI-NEXT: v_readlane_b32 s33, v40, 2 +; FIXEDABI-NEXT: v_readlane_b32 s33, v41, 0 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; FIXEDABI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; FIXEDABI-NEXT: s_mov_b64 exec, s[4:5] ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -172,7 +172,7 @@ ; GCN-NEXT: .vgpr_count: 0x1{{$}} ; GCN-NEXT: no_stack_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x24{{$}} +; GCN-NEXT: .sgpr_count: 0x25{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; GCN-NEXT: .vgpr_count: 0x3{{$}} ; GCN-NEXT: no_stack_extern_call: @@ -180,19 +180,19 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: no_stack_extern_call_many_args: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: no_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_lds: ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x20{{$}} @@ -202,7 +202,7 @@ ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x26{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x29{{$}} +; GCN-NEXT: .vgpr_count: 0x2a{{$}} ; GCN-NEXT: simple_stack: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x21{{$}} @@ -210,7 +210,7 @@ ; GCN-NEXT: .vgpr_count: 0x2{{$}} ; GCN-NEXT: simple_stack_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x24{{$}} +; GCN-NEXT: .sgpr_count: 0x25{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; GCN-NEXT: .vgpr_count: 0x4{{$}} ; GCN-NEXT: simple_stack_extern_call: @@ -218,16 +218,16 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} -; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .stack_frame_size_in_bytes: 0x30{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_stack_recurse: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x26{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2a{{$}} +; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: ... diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -13,16 +13,16 @@ } ; GCN-LABEL: {{^}}indirect_use_vcc: -; GCN: v_writelane_b32 v40, s33, 2 +; GCN: v_writelane_b32 v41, s33, 0 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 s33, v40, 2 +; GCN: v_readlane_b32 s33, v41, 0 ; GCN: s_setpc_b64 s[30:31] ; GCN: ; NumSgprs: 36 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define void @indirect_use_vcc() #1 { call void @use_vcc() ret void @@ -33,7 +33,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 { call void @indirect_use_vcc() ret void @@ -51,7 +51,7 @@ ; GCN-LABEL: {{^}}indirect_use_flat_scratch: ; CI: ; NumSgprs: 38 ; VI: ; NumSgprs: 40 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() ret void @@ -62,7 +62,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 { call void @indirect_use_flat_scratch() ret void @@ -77,7 +77,7 @@ } ; GCN-LABEL: {{^}}indirect_use_10_vgpr: -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define void @indirect_use_10_vgpr() #0 { call void @use_10_vgpr() ret void @@ -85,7 +85,7 @@ ; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr: ; GCN: is_dynamic_callstack = 0 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { call void @indirect_use_10_vgpr() ret void diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -23,10 +23,12 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; MUBUF: buffer_store_dword +; MUBUF: buffer_store_dword +; FLATSCR: scratch_store_dword ; FLATSCR: scratch_store_dword -; GCN: v_writelane_b32 v40, s33, 4 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 +; GCN: v_writelane_b32 v41, s33, 0 ; GCN: v_writelane_b32 v40, s34, 2 ; GCN: v_writelane_b32 v40, s35, 3 @@ -41,8 +43,10 @@ ; FLATSCR-DAG: v_readlane_b32 s31, v40, 1 ; FLATSCR-DAG: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 s33, v40, 4 +; GCN: v_readlane_b32 s33, v41, 0 ; MUBUF: buffer_load_dword +; MUBUF: buffer_load_dword +; FLATSCR: scratch_load_dword ; FLATSCR: scratch_load_dword ; GCN: s_setpc_b64 s[30:31] define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { @@ -54,8 +58,10 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: ; MUBUF: buffer_store_dword v40 +; MUBUF: buffer_store_dword v41 ; FLATSCR: scratch_store_dword off, v40 -; GCN: v_writelane_b32 v40, s33, 4 +; FLATSCR: scratch_store_dword off, v41 +; GCN: v_writelane_b32 v41, s33, 0 ; GCN: s_mov_b32 s33, s32 ; MUBUF: s_addk_i32 s32, 0x400 @@ -63,9 +69,11 @@ ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s33, v40, 4 +; GCN: v_readlane_b32 s33, v41, 0 ; MUBUF: buffer_load_dword v40 +; MUBUF: buffer_load_dword v41 ; FLATSCR: scratch_load_dword v40 +; FLATSCR: scratch_load_dword v41 define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -87,9 +87,11 @@ ; GCN-NEXT: s_waitcnt ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN: v_writelane_b32 [[CSR_VGPR_1]], s33, 0 ; GCN-DAG: s_mov_b32 s33, s32 ; MUBUF-DAG: s_addk_i32 s32, 0x400{{$}} ; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}} @@ -107,10 +109,12 @@ ; MUBUF: s_addk_i32 s32, 0xfc00{{$}} ; FLATSCR: s_add_i32 s32, s32, -16{{$}} -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 +; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR_1]], 0 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -132,11 +136,13 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; MUBUF-DAG: s_addk_i32 s32, 0x400 ; FLATSCR-DAG: s_add_i32 s32, s32, 16 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s33, [[FP_SPILL_LANE:[0-9]+]] +; GCN-DAG: v_writelane_b32 [[CSR_VGPR_1]], s33, [[FP_SPILL_LANE:[0-9]+]] ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 @@ -147,10 +153,12 @@ ; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, -16 -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]] +; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR_1]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -269,8 +277,8 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v0, s33, 63 ; GCN-COUNT-60: v_writelane_b32 v0 +; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 ; GCN: v_writelane_b32 v0 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill @@ -285,7 +293,7 @@ ; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, 16 ; FLATSCR: s_add_i32 s32, s32, -16 -; GCN-NEXT: v_readlane_b32 s33, v0, 63 +; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload @@ -389,7 +397,7 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 @@ -404,7 +412,7 @@ ; GCN: v_readlane_b32 s30, [[CSR_VGPR]], 0 ; MUBUF: s_addk_i32 s32, 0xfd00 ; FLATSCR: s_add_i32 s32, s32, -12 -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 +; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload @@ -432,7 +440,7 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; MUBUF: s_addk_i32 s32, 0x300{{$}} ; FLATSCR: s_add_i32 s32, s32, 12{{$}} @@ -443,7 +451,7 @@ ; GCN: ;;#ASMSTART ; MUBUF: s_addk_i32 s32, 0xfd00{{$}} ; FLATSCR: s_add_i32 s32, s32, -12{{$}} -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 +; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload @@ -480,7 +488,7 @@ ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-DAG: s_mov_b32 s33, s32 ; MUBUF-DAG: s_add_i32 s32, s32, 0x40300{{$}} ; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}} @@ -490,7 +498,7 @@ ; GCN: ;;#ASMSTART ; MUBUF: s_add_i32 s32, s32, 0xfffbfd00{{$}} ; FLATSCR: s_addk_i32 s32, 0xeff4{{$}} -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 +; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload @@ -530,7 +538,7 @@ ; An FP is needed, despite not needing any spills ; TODO: Ccould see callee does not use stack and omit FP. ; GCN-LABEL: {{^}}ipra_call_with_stack: -; GCN: v_writelane_b32 v0, s33, 2 +; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 @@ -539,7 +547,7 @@ ; GCN: s_swappc_b64 ; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, -16 -; GCN: v_readlane_b32 s33, v0, 2 +; GCN: s_mov_b32 s33, [[TMP_SGPR]] define void @ipra_call_with_stack() #0 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -29,8 +29,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -42,9 +43,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -63,8 +65,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -76,9 +79,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -97,8 +101,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -110,9 +115,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -131,8 +137,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -145,9 +152,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -18,8 +18,8 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: v_writelane_b32 v40, s33, 15 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: v_writelane_b32 v40, s34, 2 @@ -32,6 +32,7 @@ ; CHECK-NEXT: v_writelane_b32 v40, s41, 9 ; CHECK-NEXT: v_writelane_b32 v40, s42, 10 ; CHECK-NEXT: v_writelane_b32 v40, s43, 11 +; CHECK-NEXT: v_writelane_b32 v42, s33, 0 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s44, 12 @@ -87,9 +88,10 @@ ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: v_readlane_b32 s33, v40, 15 +; CHECK-NEXT: v_readlane_b32 s33, v42, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -12,8 +12,9 @@ ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s33, 2 +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s33, 0 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 @@ -28,9 +29,10 @@ ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 -; SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v40, 2 +; SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v41, 0 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SPILL-TO-VGPR-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -11,7 +11,6 @@ ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SDAG-NEXT: s_mov_b64 exec, s[34:35] -; SDAG-NEXT: v_writelane_b32 v40, s33, 28 ; SDAG-NEXT: v_writelane_b32 v40, s4, 0 ; SDAG-NEXT: v_writelane_b32 v40, s5, 1 ; SDAG-NEXT: v_writelane_b32 v40, s6, 2 @@ -32,6 +31,7 @@ ; SDAG-NEXT: v_writelane_b32 v40, s21, 17 ; SDAG-NEXT: v_writelane_b32 v40, s22, 18 ; SDAG-NEXT: v_writelane_b32 v40, s23, 19 +; SDAG-NEXT: s_mov_b32 s36, s33 ; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x400 ; SDAG-NEXT: v_writelane_b32 v40, s24, 20 @@ -78,7 +78,7 @@ ; SDAG-NEXT: v_readlane_b32 s5, v40, 1 ; SDAG-NEXT: v_readlane_b32 s4, v40, 0 ; SDAG-NEXT: s_addk_i32 s32, 0xfc00 -; SDAG-NEXT: v_readlane_b32 s33, v40, 28 +; SDAG-NEXT: s_mov_b32 s33, s36 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] @@ -91,7 +91,6 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[34:35] -; GISEL-NEXT: v_writelane_b32 v40, s33, 28 ; GISEL-NEXT: v_writelane_b32 v40, s4, 0 ; GISEL-NEXT: v_writelane_b32 v40, s5, 1 ; GISEL-NEXT: v_writelane_b32 v40, s6, 2 @@ -112,6 +111,7 @@ ; GISEL-NEXT: v_writelane_b32 v40, s21, 17 ; GISEL-NEXT: v_writelane_b32 v40, s22, 18 ; GISEL-NEXT: v_writelane_b32 v40, s23, 19 +; GISEL-NEXT: s_mov_b32 s36, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s24, 20 @@ -158,7 +158,7 @@ ; GISEL-NEXT: v_readlane_b32 s5, v40, 1 ; GISEL-NEXT: v_readlane_b32 s4, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 28 +; GISEL-NEXT: s_mov_b32 s33, s36 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -98,8 +98,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -113,9 +114,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -126,25 +128,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -156,25 +161,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -189,10 +197,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -206,9 +215,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -219,17 +229,18 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 @@ -237,9 +248,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -251,17 +264,18 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 @@ -269,9 +283,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -287,10 +303,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -304,9 +321,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -317,17 +335,18 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 @@ -335,9 +354,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -349,17 +370,18 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 @@ -367,9 +389,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -385,8 +409,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -399,9 +424,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -412,24 +438,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -441,24 +470,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -473,10 +505,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -488,9 +521,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -501,25 +535,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -531,25 +568,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -565,10 +605,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -580,9 +621,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -593,25 +635,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -623,25 +668,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -657,8 +705,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -671,9 +720,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -684,24 +734,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -713,24 +766,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -745,10 +801,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -760,9 +817,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -773,25 +831,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -803,25 +864,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -837,10 +901,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -852,9 +917,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -865,25 +931,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -895,25 +964,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -929,8 +1001,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -943,9 +1016,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -956,24 +1030,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -985,24 +1062,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1017,8 +1097,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1032,9 +1113,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1045,25 +1127,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1075,25 +1160,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1108,11 +1196,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1124,9 +1213,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1137,26 +1227,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1168,26 +1261,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1203,8 +1299,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1220,9 +1317,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1233,27 +1331,30 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1265,27 +1366,30 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1300,11 +1404,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1318,9 +1423,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1331,28 +1437,31 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 -; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1364,28 +1473,31 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1403,11 +1515,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1423,9 +1536,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1436,30 +1550,33 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 4 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1471,30 +1588,33 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1511,8 +1631,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1525,9 +1646,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1538,24 +1660,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1567,24 +1692,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1599,8 +1727,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1613,9 +1742,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1626,24 +1756,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1655,24 +1788,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1687,8 +1823,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1702,9 +1839,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1715,25 +1853,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1745,25 +1886,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1778,8 +1922,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1794,9 +1939,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1807,26 +1953,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1838,26 +1987,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1872,8 +2024,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1890,9 +2043,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1903,28 +2057,31 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1936,28 +2093,31 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -1972,8 +2132,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -1987,9 +2148,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2000,25 +2162,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2030,25 +2195,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2063,8 +2231,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2080,9 +2249,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2093,27 +2263,30 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2125,27 +2298,30 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2160,8 +2336,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2179,9 +2356,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2192,16 +2370,17 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -2212,9 +2391,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2226,16 +2407,17 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 @@ -2246,9 +2428,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2263,9 +2447,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2277,9 +2462,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2290,24 +2476,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2319,24 +2508,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2352,9 +2544,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2366,9 +2559,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2379,24 +2573,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2408,24 +2605,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2441,9 +2641,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2455,9 +2656,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2468,24 +2670,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2497,24 +2702,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2530,8 +2738,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2545,9 +2754,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2558,25 +2768,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2588,25 +2801,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2621,8 +2837,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2636,9 +2853,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2649,25 +2867,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2679,25 +2900,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2712,9 +2936,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2726,9 +2951,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2739,24 +2965,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2768,24 +2997,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2801,8 +3033,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2816,9 +3049,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2829,25 +3063,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2859,25 +3096,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2892,9 +3132,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2906,9 +3147,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2919,24 +3161,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2948,24 +3193,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -2981,9 +3229,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -2995,9 +3244,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3008,24 +3258,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3037,24 +3290,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3070,8 +3326,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3085,9 +3342,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3098,25 +3356,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3128,25 +3389,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3161,8 +3425,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3177,9 +3442,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3190,26 +3456,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3221,26 +3490,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3255,8 +3527,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3272,9 +3545,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3285,27 +3559,30 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3317,27 +3594,30 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3352,9 +3632,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3366,9 +3647,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3379,24 +3661,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3408,24 +3693,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3441,8 +3729,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3458,9 +3747,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3471,27 +3761,30 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3503,27 +3796,30 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3538,8 +3834,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3556,9 +3853,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3569,28 +3867,31 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3602,28 +3903,31 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3638,10 +3942,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3656,9 +3961,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3669,29 +3975,32 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3703,29 +4012,32 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3742,8 +4054,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -3763,9 +4076,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3776,18 +4090,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-NEXT: v_mov_b32_e32 v7, 8 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -3798,9 +4113,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3812,18 +4129,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 @@ -3834,9 +4152,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3851,10 +4171,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3871,9 +4192,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3884,11 +4206,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3897,18 +4221,19 @@ ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[34:35] offset:16 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[34:35] offset:32 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v16i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3920,11 +4245,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -3933,18 +4260,19 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -3961,10 +4289,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3986,9 +4315,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3999,11 +4329,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4016,18 +4348,19 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4039,11 +4372,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -4056,18 +4391,19 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4084,11 +4420,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: global_load_dword v32, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35] @@ -4112,9 +4449,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4125,11 +4463,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dword v33, v[0:1], off @@ -4143,20 +4483,21 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4168,11 +4509,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off @@ -4186,20 +4529,21 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8) ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4217,10 +4561,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v43, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -4238,10 +4583,11 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: v_readlane_b32 s33, v43, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4252,21 +4598,22 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v43, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: global_store_dword v[41:42], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4275,10 +4622,12 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: v_readlane_b32 s33, v43, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4290,21 +4639,22 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s32 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4313,10 +4663,12 @@ ; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v43, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s32 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4332,10 +4684,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4350,9 +4703,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4363,29 +4717,32 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35] ; GFX10-NEXT: global_load_dword v1, v2, s[34:35] offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4397,29 +4754,32 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dword v1, v2, s[0:1] offset:4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4436,13 +4796,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 @@ -4453,10 +4814,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4467,17 +4829,18 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -4485,10 +4848,12 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4500,17 +4865,18 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -4518,10 +4884,12 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4541,8 +4909,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 @@ -4563,7 +4932,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4571,6 +4940,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4581,23 +4951,24 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 +; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 @@ -4605,14 +4976,16 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4624,10 +4997,11 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:16 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:20 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 @@ -4648,14 +5022,16 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: global_store_dword v[0:1], v1, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:20 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4683,10 +5059,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4719,9 +5096,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4732,14 +5110,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35] @@ -4770,9 +5149,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4784,14 +5165,15 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] @@ -4822,9 +5204,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4842,7 +5226,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 32 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 @@ -4921,7 +5305,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 32 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -5129,8 +5513,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -5144,9 +5529,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5157,25 +5543,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5187,25 +5576,28 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5220,9 +5612,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -5236,9 +5629,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5249,26 +5643,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5280,26 +5677,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5314,9 +5714,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -5330,9 +5731,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5343,26 +5745,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5374,26 +5779,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5408,9 +5816,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -5424,9 +5833,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5437,26 +5847,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 42 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 42 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5468,26 +5881,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5502,10 +5918,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -5521,9 +5938,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5534,18 +5952,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -5554,9 +5973,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5568,18 +5989,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -5588,9 +6010,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5605,14 +6029,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -5628,9 +6053,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5641,13 +6067,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -5665,9 +6092,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5679,13 +6108,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -5703,9 +6133,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5721,12 +6153,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -5746,9 +6179,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5759,18 +6193,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -5785,9 +6220,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5799,18 +6236,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -5825,9 +6263,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5842,8 +6282,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -5852,6 +6292,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 @@ -5871,9 +6312,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 8 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5884,13 +6326,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -5914,9 +6357,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5928,13 +6373,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -5958,9 +6404,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -5978,8 +6426,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -5990,6 +6438,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 @@ -6013,9 +6462,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6026,13 +6476,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -6062,9 +6513,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6076,13 +6529,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -6112,9 +6566,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6131,9 +6587,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -6147,9 +6604,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6160,26 +6618,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6191,26 +6652,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6225,9 +6689,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -6241,9 +6706,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6254,26 +6720,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 4.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 4.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6285,26 +6754,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6319,10 +6791,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -6338,9 +6811,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6351,18 +6825,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -6371,9 +6846,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6385,18 +6862,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6405,9 +6883,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6422,11 +6902,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 5 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 @@ -6444,9 +6925,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 5 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6457,18 +6939,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 @@ -6480,9 +6963,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6494,18 +6979,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 @@ -6517,9 +7003,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6534,13 +7022,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 @@ -6562,9 +7051,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 7 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6575,18 +7065,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 7 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -6604,9 +7095,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6618,18 +7111,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -6647,9 +7141,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6664,10 +7160,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -6683,9 +7180,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6696,18 +7194,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -6716,9 +7215,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6730,18 +7231,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6750,9 +7252,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6767,12 +7271,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -6792,9 +7297,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6805,18 +7311,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -6831,9 +7338,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6845,18 +7354,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -6871,9 +7381,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -6888,14 +7400,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 @@ -6919,9 +7432,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 8 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6932,18 +7446,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -6964,9 +7479,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6978,18 +7495,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -7010,9 +7528,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7027,10 +7547,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -7043,9 +7564,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7056,26 +7578,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7087,26 +7612,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7122,11 +7650,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7140,9 +7669,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7153,12 +7683,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -7172,9 +7703,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7186,12 +7719,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -7205,9 +7739,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7223,11 +7759,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7241,9 +7778,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7254,12 +7792,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -7273,9 +7812,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7287,12 +7828,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -7306,9 +7848,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7324,10 +7868,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7343,9 +7888,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7356,18 +7902,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 3 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 3 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -7376,9 +7923,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7390,18 +7939,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7410,9 +7960,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7427,10 +7979,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7446,9 +7999,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7459,18 +8013,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -7479,9 +8034,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7493,18 +8050,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7513,9 +8071,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7530,11 +8090,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7548,9 +8109,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7561,12 +8123,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -7580,9 +8143,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7594,12 +8159,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -7613,9 +8179,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7631,10 +8199,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7650,9 +8219,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7663,18 +8233,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -7683,9 +8254,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7697,18 +8270,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7717,9 +8291,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7734,10 +8310,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -7750,9 +8327,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7763,26 +8341,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7794,26 +8375,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7829,11 +8413,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7847,9 +8432,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7860,12 +8446,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -7879,9 +8466,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7893,12 +8482,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -7912,9 +8502,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -7930,10 +8522,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -7949,9 +8542,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7962,18 +8556,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -7982,9 +8577,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7996,18 +8593,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8016,9 +8614,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8033,11 +8633,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 5 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 @@ -8055,9 +8656,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 5 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8068,18 +8670,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 5 ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 @@ -8091,9 +8694,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8105,18 +8710,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 @@ -8128,9 +8734,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8145,12 +8753,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -8170,9 +8779,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8183,18 +8793,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 5 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8209,9 +8820,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8223,18 +8836,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8249,9 +8863,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8266,13 +8882,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -8288,9 +8905,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8301,12 +8919,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8324,9 +8943,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8338,12 +8959,13 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8361,9 +8983,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8379,12 +9003,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 @@ -8404,9 +9029,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8417,18 +9043,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8443,9 +9070,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8457,18 +9086,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8483,9 +9113,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8500,13 +9132,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 @@ -8528,9 +9161,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 7 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8541,18 +9175,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 7 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8570,9 +9205,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8584,18 +9221,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8613,9 +9251,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8630,8 +9270,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -8643,6 +9283,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 @@ -8662,9 +9303,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8675,13 +9317,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8708,9 +9351,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8722,13 +9367,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8755,9 +9401,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8774,8 +9422,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -8784,6 +9432,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 @@ -8811,9 +9460,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8824,18 +9474,19 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8862,9 +9513,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8876,18 +9529,19 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8914,9 +9568,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -8931,8 +9587,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 18 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -8952,6 +9608,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 16 @@ -8979,9 +9636,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 18 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8992,13 +9650,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9041,9 +9700,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 18 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9055,13 +9716,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9104,9 +9766,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 18 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9123,8 +9787,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 28 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9153,6 +9817,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 ; GFX9-NEXT: v_writelane_b32 v40, s27, 23 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s28, 24 @@ -9215,9 +9880,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 28 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9228,13 +9894,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9322,9 +9989,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9336,13 +10005,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9426,9 +10096,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9445,8 +10117,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 28 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9476,6 +10148,7 @@ ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s24, 20 ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 @@ -9542,9 +10215,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 28 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9555,13 +10229,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9654,9 +10329,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9668,13 +10345,14 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9763,9 +10441,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9783,12 +10463,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -9801,10 +10482,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9815,15 +10497,16 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12 @@ -9835,10 +10518,12 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9850,26 +10535,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9885,8 +10573,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 12 @@ -9938,9 +10627,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9951,12 +10641,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-NEXT: v_mov_b32_e32 v2, 14 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 15 @@ -10005,9 +10696,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10019,22 +10712,23 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 2 @@ -10070,9 +10764,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -10100,8 +10796,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 @@ -10161,9 +10858,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10174,12 +10872,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 10 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -10236,9 +10935,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10250,9 +10951,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 @@ -10261,6 +10962,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 @@ -10306,9 +11008,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -10332,8 +11036,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000 @@ -10393,9 +11098,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10406,12 +11112,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -10468,9 +11175,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10482,9 +11191,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41400000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41500000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41600000 @@ -10493,6 +11202,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41100000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 @@ -10538,9 +11248,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -10,10 +10,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 @@ -30,9 +31,10 @@ ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -43,12 +45,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -64,9 +67,11 @@ ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -134,9 +139,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -157,9 +163,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -170,15 +177,16 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: ;;#ASMSTART @@ -194,9 +202,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -213,8 +223,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v42, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -236,9 +247,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -249,21 +261,22 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, v31 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_mov_b32_e32 v31, v41 ; GFX10-NEXT: ;;#ASMSTART @@ -273,9 +286,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -293,9 +308,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -316,9 +332,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -329,20 +346,21 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_mov_b32 s33, s4 @@ -353,9 +371,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -372,9 +392,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -395,9 +416,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -408,20 +430,21 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[36:37] ; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s34 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_mov_b32 s34, s4 @@ -432,9 +455,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -451,8 +476,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 2 +; GFX9-NEXT: v_writelane_b32 v42, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v41, s30, 0 @@ -472,9 +498,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 2 +; GFX9-NEXT: v_readlane_b32 s33, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -485,20 +512,21 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v41, s33, 2 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v40 @@ -507,9 +535,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 2 +; GFX10-NEXT: v_readlane_b32 s33, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -610,8 +640,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -623,9 +654,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -636,23 +668,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s33@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -667,8 +702,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -680,9 +716,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -693,23 +730,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s34@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -724,9 +764,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -746,9 +787,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -759,20 +801,21 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART @@ -782,9 +825,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -801,9 +846,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v42, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 @@ -832,9 +878,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: v_readlane_b32 s33, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -845,26 +892,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v42, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_mov_b32_e32 v41, v32 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s4 @@ -877,9 +925,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: v_readlane_b32 s33, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -26,7 +26,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v1, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -40,7 +40,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v1, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -55,21 +55,21 @@ ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v1, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v1, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -105,7 +105,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v1, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -119,7 +119,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v1, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -134,21 +134,21 @@ ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v1, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v1, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -184,7 +184,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v1, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -198,7 +198,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v1, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -213,21 +213,21 @@ ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v1, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v1, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -265,7 +265,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v2, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -279,7 +279,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v2, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -294,21 +294,21 @@ ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v2, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v2, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -1372,7 +1372,7 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v2, s33, 2 +; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x1ffc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffe0000 ; GFX9-NEXT: s_add_i32 s32, s32, 0x60000 @@ -1388,7 +1388,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000 -; GFX9-NEXT: v_readlane_b32 s33, v2, 2 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] @@ -1403,7 +1403,7 @@ ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v2, s33, 2 +; GFX10-NEXT: s_mov_b32 s36, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0 ; GFX10-NEXT: s_add_i32 s32, s32, 0x30000 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000 @@ -1419,7 +1419,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000 -; GFX10-NEXT: v_readlane_b32 s33, v2, 2 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -395,8 +395,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 17 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -461,9 +462,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 17 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -473,8 +475,9 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v40, s33, 17 +; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -539,9 +542,10 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 17 +; GISEL-NEXT: v_readlane_b32 s33, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -555,8 +559,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 17 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -624,9 +629,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 17 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -636,8 +642,9 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v40, s33, 17 +; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -703,9 +710,10 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 17 +; GISEL-NEXT: v_readlane_b32 s33, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -719,8 +727,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 17 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -787,9 +796,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 17 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -799,8 +809,9 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v40, s33, 17 +; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -867,9 +878,10 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 17 +; GISEL-NEXT: v_readlane_b32 s33, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -884,8 +896,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 19 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -961,9 +974,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 19 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -973,8 +987,9 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v40, s33, 19 +; GISEL-NEXT: v_writelane_b32 v41, s33, 0 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -1050,9 +1065,10 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 19 +; GISEL-NEXT: v_readlane_b32 s33, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1074,7 +1090,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 32 +; GCN-NEXT: s_mov_b32 s5, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -1155,7 +1171,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 32 +; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1168,7 +1184,7 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_writelane_b32 v40, s33, 32 +; GISEL-NEXT: s_mov_b32 s5, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -1249,7 +1265,7 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 32 +; GISEL-NEXT: s_mov_b32 s33, s5 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] @@ -1266,7 +1282,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 32 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1351,7 +1367,7 @@ ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 32 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1364,7 +1380,7 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_writelane_b32 v40, s33, 32 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1449,7 +1465,7 @@ ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 32 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] @@ -1470,7 +1486,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 32 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -1553,7 +1569,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 32 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1566,7 +1582,7 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_writelane_b32 v40, s33, 32 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -1649,7 +1665,7 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 32 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] @@ -1667,7 +1683,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 32 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -1747,7 +1763,7 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 32 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1760,7 +1776,7 @@ ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_writelane_b32 v40, s33, 32 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -1840,7 +1856,7 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v40, 32 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -190,9 +190,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v44, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 @@ -224,9 +225,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: v_readlane_b32 s33, v44, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -30,7 +30,7 @@ ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v1, s33, 2 +; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -51,7 +51,7 @@ ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v1, 2 +; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] @@ -155,7 +155,7 @@ ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v1, s33, 2 +; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v1, s30, 0 @@ -171,7 +171,7 @@ ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v1, 2 +; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] @@ -189,7 +189,7 @@ ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v2, s33, 2 +; CHECK-NEXT: s_mov_b32 s7, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v2, s30, 0 @@ -205,7 +205,7 @@ ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v2, 2 +; CHECK-NEXT: s_mov_b32 s33, s7 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -13,8 +13,9 @@ ; Spill CSR VGPR used for SGPR spilling ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v40, s33, 2 +; GCN-DAG: v_writelane_b32 v41, s33, 0 ; GCN-DAG: s_mov_b32 s33, s32 ; GCN-DAG: s_addk_i32 s32, 0x400 ; GCN-DAG: v_writelane_b32 v40, s30, 0 @@ -26,9 +27,10 @@ ; GCN: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -15,8 +15,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: v_writelane_b32 v40, s33, 2 +; CHECK-NEXT: v_writelane_b32 v41, s33, 0 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: .Ltmp0: @@ -38,9 +39,10 @@ ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v40, 2 +; CHECK-NEXT: v_readlane_b32 s33, v41, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -24,7 +24,7 @@ liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX8-LABEL: name: pei_scavenge_vgpr_spill - ; GFX8: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr2 + ; GFX8: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc @@ -51,7 +51,7 @@ ; GFX8-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) ; GFX8-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; GFX9-LABEL: name: pei_scavenge_vgpr_spill - ; GFX9: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr2 + ; GFX9: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc @@ -76,7 +76,7 @@ ; GFX9-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) ; GFX9-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill - ; GFX9-FLATSCR: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr2 + ; GFX9-FLATSCR: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/save-fp.ll b/llvm/test/CodeGen/AMDGPU/save-fp.ll --- a/llvm/test/CodeGen/AMDGPU/save-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/save-fp.ll @@ -11,14 +11,14 @@ ; GCN-LABEL: {{^}}caller: -; GCN: v_writelane_b32 v2, s33, 2 +; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 ; GFX900: buffer_store_dword ; GFX908-DAG: v_accvgpr_write_b32 ; GCN: s_swappc_b64 ; GFX900: buffer_load_dword ; GFX908: v_accvgpr_read_b32 -; GCN: v_readlane_b32 s33, v2, 2 +; GCN: s_mov_b32 s33, [[TMP_SGPR]] define i64 @caller() { bb: call void asm sideeffect "", "~{v40}" () diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -7,13 +7,13 @@ ; GCN-LABEL: {{^}}spill_sgpr_with_no_lower_vgpr_available: ; GCN: buffer_store_dword v255, off, s[0:3], s32 -; GCN: v_writelane_b32 v255, s33, 2 +; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 ; GCN: v_writelane_b32 v255, s30, 0 ; GCN: v_writelane_b32 v255, s31, 1 ; GCN: s_swappc_b64 s[30:31], s[4:5] ; GCN: v_readlane_b32 s31, v255, 1 ; GCN: v_readlane_b32 s30, v255, 0 -; GCN: v_readlane_b32 s33, v255, 2 +; GCN: s_mov_b32 s33, [[TMP_SGPR]] ; GCN: ; NumVgprs: 256 define void @spill_sgpr_with_no_lower_vgpr_available() #0 { @@ -53,13 +53,13 @@ ; GCN-LABEL: {{^}}spill_to_lowest_available_vgpr: ; GCN: buffer_store_dword v254, off, s[0:3], s32 -; GCN: v_writelane_b32 v254, s33, 2 +; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 ; GCN: v_writelane_b32 v254, s30, 0 ; GCN: v_writelane_b32 v254, s31, 1 ; GCN: s_swappc_b64 s[30:31], s[4:5] ; GCN: v_readlane_b32 s31, v254, 1 ; GCN: v_readlane_b32 s30, v254, 0 -; GCN: v_readlane_b32 s33, v254, 2 +; GCN: s_mov_b32 s33, [[TMP_SGPR]] define void @spill_to_lowest_available_vgpr() #0 { %alloca = alloca i32, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -200,9 +200,10 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 ; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[CSRV_1:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 [[CSRV]], s33, 2 -; GCN-DAG: s_addk_i32 s32, 0x400 +; GCN: v_writelane_b32 [[CSRV_1]], s33, 0 +; GCN-DAG: s_addk_i32 s32, 0x800 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -226,10 +227,11 @@ ; GCN-DAG: v_readlane_b32 s30, [[CSRV]], 0 ; GCN-DAG: v_readlane_b32 s31, [[CSRV]], 1 -; GCN: s_addk_i32 s32, 0xfc00 +; GCN: s_addk_i32 s32, 0xf800 ; GCN-NEXT: v_readlane_b32 s33, ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[CSRV_1]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -3,16 +3,18 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v40, s33, 2 +; GCN: v_writelane_b32 v41, s33, 0 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 s33, v40, 2 +; GCN: v_readlane_b32 s33, v41, 0 ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_setpc_b64 define void @spill_csr_s5_copy() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -157,12 +157,13 @@ ; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: ; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[VGPR_REG_1:v[0-9]+]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2 +; GCN-NEXT: v_writelane_b32 [[VGPR_REG_1]], s33, 0 ; GCN-DAG: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 ; GCN: v_mov_b32_e32 v32, 0 -; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 +; GCN-DAG: v_writelane_b32 [[VGPR_REG_1]], s34, 1 ; GCN: s_mov_b32 s34, s32 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -174,10 +175,11 @@ ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 ; GCN: s_add_i32 s32, s32, 0xfffd0000 -; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2 -; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 +; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG_1]], 0 +; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG_1]], 1 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[VGPR_REG_1]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN: s_setpc_b64 s[30:31] %temp = alloca i32, align 1024, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll --- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll @@ -20,8 +20,8 @@ ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_writelane_b32 v1, s33, 3 ; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: s_mov_b32 s36, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v1, s30, 1 @@ -36,7 +36,7 @@ ; GCN-NEXT: v_readlane_b32 s30, v1, 1 ; GCN-NEXT: v_readlane_b32 s4, v1, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v1, 3 +; GCN-NEXT: s_mov_b32 s33, s36 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -8,8 +8,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -51,9 +52,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: v_readlane_b32 s33, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -184,8 +186,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 17 +; GCN-NEXT: v_writelane_b32 v46, s33, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x800 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -12,9 +12,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v45, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v36, v16 ; GFX9-NEXT: v_mov_b32_e32 v35, v15 @@ -61,9 +62,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v45, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -74,6 +76,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v36, v16 @@ -82,7 +85,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v33, v13 ; GFX10-NEXT: v_mov_b32_e32 v32, v12 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v45, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_mov_b32 s5, s4 ; GFX10-NEXT: s_mov_b32 s6, s4 @@ -126,9 +129,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v45, 0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -163,8 +168,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: v_writelane_b32 v40, s36, 2 @@ -173,6 +178,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s39, 5 ; GFX9-NEXT: v_writelane_b32 v40, s40, 6 ; GFX9-NEXT: v_writelane_b32 v40, s41, 7 +; GFX9-NEXT: v_writelane_b32 v46, s33, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_writelane_b32 v40, s42, 8 ; GFX9-NEXT: s_mov_b32 s36, 0 @@ -222,9 +228,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v46, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -235,25 +242,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v46, s33, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, v16 ; GFX10-NEXT: v_mov_b32_e32 v42, v15 ; GFX10-NEXT: v_mov_b32_e32 v43, v14 -; GFX10-NEXT: v_mov_b32_e32 v44, v13 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: v_mov_b32_e32 v45, v12 ; GFX10-NEXT: v_writelane_b32 v40, s36, 2 ; GFX10-NEXT: s_mov_b32 s36, 0 +; GFX10-NEXT: v_mov_b32_e32 v44, v13 +; GFX10-NEXT: v_mov_b32_e32 v45, v12 ; GFX10-NEXT: v_writelane_b32 v40, s37, 3 ; GFX10-NEXT: s_mov_b32 s37, s36 ; GFX10-NEXT: v_writelane_b32 v40, s38, 4 @@ -296,9 +304,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: v_readlane_b32 s33, v46, 0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1121,11 +1121,12 @@ ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]+]], -1{{$}} ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v41, s33, 0 ; GCN: s_mov_b32 s33, s32 ; GFX1064: s_addk_i32 s32, 0x400 ; GFX1032: s_addk_i32 s32, 0x200 @@ -1140,10 +1141,12 @@ ; GFX1064: s_addk_i32 s32, 0xfc00 ; GFX1032: s_addk_i32 s32, 0xfe00 -; GCN: v_readlane_b32 s33, v40, 2 +; GCN: v_readlane_b32 s33, v41, 0 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -336,7 +336,7 @@ ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 2 +; GFX9-O0-NEXT: s_mov_b32 s35, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 @@ -370,7 +370,7 @@ ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-O0-NEXT: s_mov_b32 s33, s35 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -387,8 +387,8 @@ ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v3, s33, 2 ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 @@ -410,7 +410,7 @@ ; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-O3-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -526,7 +526,7 @@ ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_writelane_b32 v10, s33, 8 +; GFX9-O0-NEXT: s_mov_b32 s42, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 @@ -590,7 +590,7 @@ ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 -; GFX9-O0-NEXT: v_readlane_b32 s33, v10, 8 +; GFX9-O0-NEXT: s_mov_b32 s33, s42 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 @@ -630,8 +630,8 @@ ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v8, s33, 2 ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 +; GFX9-O3-NEXT: s_mov_b32 s40, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 @@ -663,7 +663,7 @@ ; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-O3-NEXT: v_readlane_b32 s33, v8, 2 +; GFX9-O3-NEXT: s_mov_b32 s33, s40 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0