diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -10,6 +10,7 @@ #define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H #include "AMDGPUFrameLowering.h" +#include "SIRegisterInfo.h" namespace llvm { @@ -35,6 +36,14 @@ RegScavenger *RS = nullptr) const; void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs) const; + void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + LivePhysRegs &LiveRegs, Register FrameReg, + Register FramePtrRegScratchCopy) const; + void emitCSRSpillRestores(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + LivePhysRegs &LiveRegs, Register FrameReg, + Register FramePtrRegScratchCopy) const; bool assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -126,7 +126,8 @@ LivePhysRegs &LiveRegs, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - Register SpillReg, int FI, int64_t DwordOff = 0) { + Register SpillReg, int FI, Register FrameReg, + int64_t DwordOff = 0) { unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR : AMDGPU::BUFFER_STORE_DWORD_OFFSET; @@ -137,18 +138,20 @@ FrameInfo.getObjectAlign(FI)); LiveRegs.addReg(SpillReg); bool IsKill = !MBB.isLiveIn(SpillReg); - TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, - FuncInfo.getStackPtrOffsetReg(), DwordOff, MMO, - nullptr, &LiveRegs); + TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg, + DwordOff, MMO, nullptr, &LiveRegs); if (IsKill) LiveRegs.removeReg(SpillReg); } -static void buildEpilogRestore( - const GCNSubtarget &ST, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &FuncInfo, LivePhysRegs &LiveRegs, - MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, Register SpillReg, int FI, int64_t DwordOff = 0) { +static void buildEpilogRestore(const GCNSubtarget &ST, + const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &FuncInfo, + LivePhysRegs &LiveRegs, MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, Register SpillReg, int FI, + Register FrameReg, int64_t DwordOff = 0) { unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; @@ -157,9 +160,8 @@ MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), FrameInfo.getObjectAlign(FI)); - TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, - FuncInfo.getStackPtrOffsetReg(), DwordOff, MMO, - nullptr, &LiveRegs); + TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg, + DwordOff, MMO, nullptr, &LiveRegs); } static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -222,6 +224,7 @@ const PrologEpilogSGPRSaveRestoreInfo SI; LivePhysRegs &LiveRegs; const DebugLoc &DL; + Register FrameReg; ArrayRef SplitParts; unsigned NumSubRegs; unsigned EltSize = 4; @@ -245,7 +248,7 @@ .addReg(SubReg); buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, - FI, DwordOff); + FI, FrameReg, DwordOff); DwordOff += 4; } } @@ -290,7 +293,7 @@ : Register(TRI.getSubReg(SuperReg, SplitParts[I])); buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, - FI, DwordOff); + FI, FrameReg, DwordOff); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) .addReg(TmpVGPR, RegState::Kill); DwordOff += 4; @@ -326,11 +329,11 @@ MachineBasicBlock::iterator MI, const DebugLoc &DL, const SIInstrInfo *TII, const SIRegisterInfo &TRI, - LivePhysRegs &LiveRegs) + LivePhysRegs &LiveRegs, Register FrameReg) : MI(MI), MBB(MBB), MF(*MBB.getParent()), ST(MF.getSubtarget()), MFI(MF.getFrameInfo()), FuncInfo(MF.getInfo()), TII(TII), TRI(TRI), - SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL) { + SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) { const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg); SplitParts = TRI.getRegSplitParts(RC, EltSize); NumSubRegs = SplitParts.empty() ? 
1 : SplitParts.size(); @@ -898,41 +901,19 @@ return ScratchExecCopy; } -void SIFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const { +void SIFrameLowering::emitCSRSpillStores( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, + Register FrameReg, Register FramePtrRegScratchCopy) const { SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - if (FuncInfo->isEntryFunction()) { - emitEntryFunctionPrologue(MF, MBB); - return; - } - - MachineFrameInfo &MFI = MF.getFrameInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); - Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); - Register FramePtrReg = FuncInfo->getFrameOffsetReg(); - Register BasePtrReg = - TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); - LivePhysRegs LiveRegs; - - MachineBasicBlock::iterator MBBI = MBB.begin(); - // DebugLoc must be unknown since the first instruction with DebugLoc is used - // to determine the end of the prologue. - DebugLoc DL; - - bool HasFP = false; - bool HasBP = false; - uint32_t NumBytes = MFI.getStackSize(); - uint32_t RoundedSize = NumBytes; - // To avoid clobbering VGPRs in lanes that weren't active on function entry, - // turn on all lanes before doing the spill to memory. - Register ScratchExecCopy; - // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch // registers. However, save all lanes of callee-saved VGPRs. Due to this, we // might end up flipping the EXEC bits twice. 
+ Register ScratchExecCopy; SmallVector, 2> WWMCalleeSavedRegs, WWMScratchRegs; FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); if (!WWMScratchRegs.empty()) @@ -946,7 +927,7 @@ Register VGPR = Reg.first; int FI = Reg.second; buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, - VGPR, FI); + VGPR, FI, FrameReg); } }; @@ -973,9 +954,20 @@ LiveRegs.addReg(ScratchExecCopy); } + Register FramePtrReg = FuncInfo->getFrameOffsetReg(); + for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { - PrologEpilogSGPRSpillBuilder SB(Spill.first, Spill.second, MBB, MBBI, DL, - TII, TRI, LiveRegs); + // Special handling for the FP spill: + // Skip if FP is saved to a scratch SGPR; the save has already been emitted. + // Otherwise, FP has been moved to a temporary register, so spill that + // register instead. + Register Reg = + Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; + if (!Reg) + continue; + + PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, + LiveRegs, FrameReg); SB.save(); } @@ -995,9 +987,147 @@ LiveRegs.addReg(Reg); } } +} + +void SIFrameLowering::emitCSRSpillRestores( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, + Register FrameReg, Register FramePtrRegScratchCopy) const { + const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + Register FramePtrReg = FuncInfo->getFrameOffsetReg(); + + for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { + // Special handling for the FP restore: + // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore + // the FP value to a temporary register. The frame pointer should be + // overwritten only at the end, after all other spills are restored from + // the current frame. + Register Reg = + Spill.first == FramePtrReg ? 
FramePtrRegScratchCopy : Spill.first; + if (!Reg) + continue; + + PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, + LiveRegs, FrameReg); + SB.restore(); + } + + // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the + // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to + // this, we might end up flipping the EXEC bits twice. + Register ScratchExecCopy; + SmallVector, 2> WWMCalleeSavedRegs, WWMScratchRegs; + FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); + if (!WWMScratchRegs.empty()) + ScratchExecCopy = + buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + /*IsProlog*/ false, /*EnableInactiveLanes*/ true); + + auto RestoreWWMRegisters = + [&](SmallVectorImpl> &WWMRegs) { + for (const auto &Reg : WWMRegs) { + Register VGPR = Reg.first; + int FI = Reg.second; + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, + VGPR, FI, FrameReg); + } + }; + + RestoreWWMRegisters(WWMScratchRegs); + if (!WWMCalleeSavedRegs.empty()) { + if (ScratchExecCopy) { + unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + } else { + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + /*IsProlog*/ false, + /*EnableInactiveLanes*/ false); + } + } - if (TRI.hasStackRealignment(MF)) { + RestoreWWMRegisters(WWMCalleeSavedRegs); + if (ScratchExecCopy) { + // FIXME: Split block and make terminator. + unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(ScratchExecCopy, RegState::Kill); + } +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + if (FuncInfo->isEntryFunction()) { + emitEntryFunctionPrologue(MF, MBB); + return; + } + + MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + Register FramePtrReg = FuncInfo->getFrameOffsetReg(); + Register BasePtrReg = + TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); + LivePhysRegs LiveRegs; + + MachineBasicBlock::iterator MBBI = MBB.begin(); + // DebugLoc must be unknown since the first instruction with DebugLoc is used + // to determine the end of the prologue. + DebugLoc DL; + + bool HasFP = false; + bool HasBP = false; + uint32_t NumBytes = MFI.getStackSize(); + uint32_t RoundedSize = NumBytes; + + if (TRI.hasStackRealignment(MF)) HasFP = true; + + Register FramePtrRegScratchCopy; + if (!HasFP && !hasFP(MF)) { + // Emit the CSR spill stores with SP base register. + emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, + FramePtrRegScratchCopy); + } else { + // CSR spill stores will use FP as base register. + Register SGPRForFPSaveRestoreCopy = + FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); + + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); + if (SGPRForFPSaveRestoreCopy) { + // Copy FP to the scratch register now and emit the CFI entry. It avoids + // the extra FP copy needed in the other two cases when FP is spilled to + // memory or to a VGPR lane. 
+ PrologEpilogSGPRSpillBuilder SB( + FramePtrReg, + FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI, + DL, TII, TRI, LiveRegs, FramePtrReg); + SB.save(); + LiveRegs.addReg(SGPRForFPSaveRestoreCopy); + } else { + // Copy FP into a new scratch register so that its previous value can be + // spilled after setting up the new frame. + FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); + if (!FramePtrRegScratchCopy) + report_fatal_error("failed to find free scratch register"); + + LiveRegs.addReg(FramePtrRegScratchCopy); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy) + .addReg(FramePtrReg); + } + } + + if (HasFP) { const unsigned Alignment = MFI.getMaxAlign().value(); RoundedSize += Alignment; @@ -1024,6 +1154,14 @@ .setMIFlag(MachineInstr::FrameSetup); } + // If FP is used, emit the CSR spills with FP base register. + if (HasFP) { + emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, + FramePtrRegScratchCopy); + if (FramePtrRegScratchCopy) + LiveRegs.removeReg(FramePtrRegScratchCopy); + } + // If we need a base pointer, set it up here. It's whatever the value of // the stack pointer is at this point. Any variable size objects will be // allocated after this, so we can still use the base pointer to reference @@ -1067,6 +1205,7 @@ const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); LivePhysRegs LiveRegs; // Get the insert location for the epilogue. If there were no terminators in // the block, get the last instruction. @@ -1086,6 +1225,32 @@ ? 
NumBytes + MFI.getMaxAlign().value() : NumBytes; const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + Register FramePtrReg = FuncInfo->getFrameOffsetReg(); + bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); + + Register FramePtrRegScratchCopy; + Register SGPRForFPSaveRestoreCopy = + FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); + if (FPSaved) { + // CSR spill restores should use FP as base register. If + // SGPRForFPSaveRestoreCopy is not valid, restore the previous value of FP + // into a new scratch register and copy to FP later when other registers are + // restored from the current stack frame. + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); + if (SGPRForFPSaveRestoreCopy) { + LiveRegs.addReg(SGPRForFPSaveRestoreCopy); + } else { + FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); + if (!FramePtrRegScratchCopy) + report_fatal_error("failed to find free scratch register"); + + LiveRegs.addReg(FramePtrRegScratchCopy); + } + + emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, + FramePtrRegScratchCopy); + } if (RoundedSize != 0 && hasFP(MF)) { auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) @@ -1095,52 +1260,19 @@ Add->getOperand(3).setIsDead(); // Mark SCC as dead. } - for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { - PrologEpilogSGPRSpillBuilder SB(Spill.first, Spill.second, MBB, MBBI, DL, - TII, TRI, LiveRegs); - SB.restore(); - } - - // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the - // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to - // this, we might end up flipping the EXEC bits twice. 
- Register ScratchExecCopy; - SmallVector, 2> WWMCalleeSavedRegs, WWMScratchRegs; - FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); - if (!WWMScratchRegs.empty()) - ScratchExecCopy = - buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, /*IsProlog*/ false, - /*EnableInactiveLanes*/ true); - - auto RestoreWWMRegisters = - [&](SmallVectorImpl> &WWMRegs) { - for (const auto &Reg : WWMRegs) { - Register VGPR = Reg.first; - int FI = Reg.second; - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, - VGPR, FI); - } - }; - RestoreWWMRegisters(WWMScratchRegs); - if (!WWMCalleeSavedRegs.empty()) { - if (ScratchExecCopy) { - unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); - } else { - ScratchExecCopy = - buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, /*IsProlog*/ false, - /*EnableInactiveLanes*/ false); - } - } - - RestoreWWMRegisters(WWMCalleeSavedRegs); - if (ScratchExecCopy) { - // FIXME: Split block and make terminator. - unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) - .addReg(ScratchExecCopy, RegState::Kill); + if (FPSaved) { + // Insert the copy to restore FP. + Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy + : FramePtrRegScratchCopy; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) + .addReg(SrcReg); + if (SGPRForFPSaveRestoreCopy) + MIB.setMIFlag(MachineInstr::FrameDestroy); + } else { + // Insert the CSR spill restores with SP as the base register. 
+ emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, + FramePtrRegScratchCopy); } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -608,6 +608,14 @@ }) != PrologEpilogSGPRSpills.end(); } + const PrologEpilogSGPRSaveRestoreInfo & + getPrologEpilogSGPRSaveRestoreInfo(Register Reg) const { + auto I = PrologEpilogSGPRSpills.find(Reg); + assert(I != PrologEpilogSGPRSpills.end()); + + return I->second; + } + ArrayRef getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const { auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -7,16 +7,17 @@ ; CHECK-LABEL: call_assert_align: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: v_writelane_b32 v41, s33, 0 +; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_writelane_b32 v41, s16, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, 
ext@rel32@lo+4 @@ -27,12 +28,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s4, v41, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: v_readlane_b32 s33, v41, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -233,12 +233,12 @@ ; MUBUF-LABEL: func_caller_stack: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: v_writelane_b32 v41, s33, 0 +; MUBUF-NEXT: s_mov_b32 s4, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:4 @@ -248,6 +248,7 @@ ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; MUBUF-NEXT: v_mov_b32_e32 v0, 12 +; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; MUBUF-NEXT: s_getpc_b64 s[4:5] @@ -256,24 +257,25 @@ ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 -; MUBUF-NEXT: v_readlane_b32 s33, v41, 0 -; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: func_caller_stack: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 -; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: v_writelane_b32 v41, s33, 0 +; FLATSCR-NEXT: s_mov_b32 s0, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: 
v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4 @@ -283,6 +285,7 @@ ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:12 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 12 +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:16 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] @@ -291,12 +294,13 @@ ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s0, v41, 0 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 -; FLATSCR-NEXT: v_readlane_b32 s33, v41, 0 -; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 -; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword v41, off, s32 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> ) @@ -307,16 +311,17 @@ ; MUBUF-LABEL: func_caller_byval: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: v_writelane_b32 v41, s33, 0 -; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 @@ -377,27 +382,29 @@ ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 -; MUBUF-NEXT: v_readlane_b32 s33, v41, 0 -; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: func_caller_byval: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 -; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off -; FLATSCR-NEXT: v_writelane_b32 v41, s33, 0 +; FLATSCR-NEXT: s_mov_b32 s0, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: 
scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 @@ -428,12 +435,13 @@ ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s0, v41, 0 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 -; FLATSCR-NEXT: v_readlane_b32 s33, v41, 0 -; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 -; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword v41, off, s32 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) %argptr) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -232,28 +232,30 @@ ; GFX9-LABEL: sink_null_insert_pt: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 
offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v41, s16, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -17,14 +17,15 @@ ; FIXEDABI-LABEL: parent_func_missing_inputs: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIXEDABI-NEXT: s_or_saveexec_b64 s[16:17], -1 -; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; FIXEDABI-NEXT: s_mov_b64 exec, s[16:17] -; FIXEDABI-NEXT: v_writelane_b32 v41, s33, 0 +; FIXEDABI-NEXT: s_mov_b32 s16, s33 ; FIXEDABI-NEXT: s_mov_b32 s33, s32 +; FIXEDABI-NEXT: s_or_saveexec_b64 s[18:19], -1 +; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; FIXEDABI-NEXT: s_mov_b64 exec, s[18:19] ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 ; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0 +; FIXEDABI-NEXT: v_writelane_b32 v41, s16, 0 ; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1 ; FIXEDABI-NEXT: s_getpc_b64 s[16:17] ; FIXEDABI-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4 @@ -32,12 +33,13 @@ ; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0 +; FIXEDABI-NEXT: v_readlane_b32 s4, v41, 0 +; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; FIXEDABI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7] ; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00 -; FIXEDABI-NEXT: v_readlane_b32 s33, v41, 0 -; FIXEDABI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; FIXEDABI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; FIXEDABI-NEXT: s_mov_b64 exec, s[4:5] +; FIXEDABI-NEXT: s_mov_b32 s33, s4 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_setpc_b64 s[30:31] call void @requires_all_inputs() diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll 
b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -200,7 +200,7 @@ ; GCN-NEXT: .vgpr_count: 0x1{{$}} ; GCN-NEXT: simple_lds_recurse: ; GCN-NEXT: .lds_size: 0x100{{$}} -; GCN-NEXT: .sgpr_count: 0x26{{$}} +; GCN-NEXT: .sgpr_count: 0x28{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; GCN-NEXT: .vgpr_count: 0x2a{{$}} ; GCN-NEXT: simple_stack: @@ -227,7 +227,7 @@ ; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_stack_recurse: ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x26{{$}} +; GCN-NEXT: .sgpr_count: 0x28{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: ... diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -1417,11 +1417,11 @@ ; GCN-LABEL: test_call: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v2, s30, 0 @@ -1437,22 +1437,22 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v2, 1 ; GCN-NEXT: v_readlane_b32 s30, v2, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 ; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 @@ -1467,22 +1467,22 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v2, 1 ; GFX7-NEXT: v_readlane_b32 s30, v2, 0 -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 @@ -1497,22 +1497,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 @@ -1526,11 +1526,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1538,12 +1538,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], 
s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 @@ -1557,12 +1557,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1575,11 +1575,11 @@ ; GCN-LABEL: test_call_v2bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v3, s30, 0 @@ -1599,22 +1599,22 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v3, 1 ; GCN-NEXT: v_readlane_b32 s30, v3, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; 
GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v2bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -1633,22 +1633,22 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v3, 1 ; GFX7-NEXT: v_readlane_b32 s30, v3, 0 -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v2bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -1662,22 +1662,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -1691,11 +1691,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1703,12 +1703,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -1722,12 +1722,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1740,11 +1740,11 @@ ; GCN-LABEL: test_call_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v4, s30, 0 @@ -1765,22 +1765,22 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v4, 1 ; GCN-NEXT: v_readlane_b32 s30, v4, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte 
Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v3bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -1800,22 +1800,22 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v3bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, 
test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -1833,22 +1833,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v3, 1 ; GFX8-NEXT: v_readlane_b32 s30, v3, 0 -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v3bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xffff @@ -1868,11 +1868,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1880,12 +1880,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; 
GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -1904,12 +1904,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1922,11 +1922,11 @@ ; GCN-LABEL: test_call_v4bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v5, s30, 0 @@ -1954,22 +1954,22 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v5, 1 ; GCN-NEXT: v_readlane_b32 s30, v5, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 
; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v4bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -1996,22 +1996,22 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v5, 1 ; GFX7-NEXT: v_readlane_b32 s30, v5, 0 -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v4bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; 
GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2036,22 +2036,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v3, 1 ; GFX8-NEXT: v_readlane_b32 s30, v3, 0 -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v4bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2071,11 +2071,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2083,12 +2083,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2108,12 +2108,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2126,11 +2126,11 @@ ; GCN-LABEL: test_call_v8bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v9, s30, 0 @@ -2174,22 +2174,22 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v9, 1 ; GCN-NEXT: v_readlane_b32 s30, v9, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v8bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2232,22 +2232,22 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v9, 1 ; GFX7-NEXT: v_readlane_b32 s30, v9, 0 -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v8bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded 
Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2286,22 +2286,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v5, 1 ; GFX8-NEXT: v_readlane_b32 s30, v5, 0 -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v8bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2329,11 +2329,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v5, 1 ; GFX9-NEXT: v_readlane_b32 s30, v5, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2341,12 +2341,12 @@ ; GFX10: ; %bb.0: ; %entry ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2374,12 +2374,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v5, 1 ; GFX10-NEXT: v_readlane_b32 s30, v5, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2392,11 +2392,11 @@ ; GCN-LABEL: test_call_v16bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v17, s30, 0 @@ -2472,22 +2472,22 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v17, 1 ; GCN-NEXT: v_readlane_b32 s30, 
v17, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v16bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2562,22 +2562,22 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v17, 1 ; GFX7-NEXT: v_readlane_b32 s30, v17, 0 -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v16bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2644,22 +2644,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v9, 1 ; GFX8-NEXT: v_readlane_b32 s30, v9, 0 -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v16bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2703,11 +2703,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v9, 1 ; GFX9-NEXT: v_readlane_b32 s30, v9, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2715,12 +2715,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 @@ -2764,12 +2764,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v9, 1 ; GFX10-NEXT: v_readlane_b32 s30, v9, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -14,13 +14,15 @@ } ; GCN-LABEL: {{^}}indirect_use_vcc: -; GCN: v_writelane_b32 v41, s33, 0 +; GCN: s_mov_b32 s4, s33 +; GCN: v_writelane_b32 v41, s4, 0 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 s33, v41, 0 +; GCN: v_readlane_b32 s4, v41, 0 +; 
GCN: s_mov_b32 s33, s4 ; GCN: s_setpc_b64 s[30:31] ; GCN: ; NumSgprs: 36 ; GCN: ; NumVgprs: 42 diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -22,14 +22,15 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; MUBUF: buffer_store_dword ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword ; FLATSCR: scratch_store_dword ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 -; GCN: v_writelane_b32 v41, s33, 0 ; GCN: v_writelane_b32 v40, s34, 2 +; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN: v_writelane_b32 v40, s35, 3 ; GCN: s_swappc_b64 @@ -43,11 +44,12 @@ ; FLATSCR-DAG: v_readlane_b32 s31, v40, 1 ; FLATSCR-DAG: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 s33, v41, 0 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 ; MUBUF: buffer_load_dword ; MUBUF: buffer_load_dword ; FLATSCR: scratch_load_dword ; FLATSCR: scratch_load_dword +; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 s[30:31] define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { call void @external_void_func_void() @@ -57,23 +59,25 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: +; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; GCN: s_mov_b32 s33, s32 ; MUBUF: buffer_store_dword v40 ; MUBUF: buffer_store_dword v41 ; FLATSCR: scratch_store_dword off, v40 ; FLATSCR: scratch_store_dword off, v41 -; GCN: v_writelane_b32 v41, s33, 0 - -; GCN: s_mov_b32 s33, s32 ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 +; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 + ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s33, v41, 0 +; GCN: v_readlane_b32 
[[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 ; MUBUF: buffer_load_dword v40 ; MUBUF: buffer_load_dword v41 ; FLATSCR: scratch_load_dword v40 ; FLATSCR: scratch_load_dword v41 +; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -85,18 +85,19 @@ ; GCN-LABEL: {{^}}callee_with_stack_and_call: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR_1]], s33, 0 -; GCN-DAG: s_mov_b32 s33, s32 ; MUBUF-DAG: s_addk_i32 s32, 0x400{{$}} ; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}} -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GCN-DAG: v_writelane_b32 
[[CSR_VGPR]], s30, +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: v_writelane_b32 [[CSR_VGPR_1]], [[FP_SCRATCH_COPY]], 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, ; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} @@ -107,15 +108,16 @@ ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]] ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]] -; MUBUF: s_addk_i32 s32, 0xfc00{{$}} -; FLATSCR: s_add_i32 s32, s32, -16{{$}} -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR_1]], 0 +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR_1]], 0 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s32 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; MUBUF: s_addk_i32 s32, 0xfc00{{$}} +; FLATSCR: s_add_i32 s32, s32, -16{{$}} +; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -134,15 +136,17 @@ ; GCN-LABEL: {{^}}callee_no_stack_with_call: ; GCN: s_waitcnt +; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword 
[[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; MUBUF-DAG: s_addk_i32 s32, 0x400 ; FLATSCR-DAG: s_add_i32 s32, s32, 16 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR_1]], s33, [[FP_SPILL_LANE:[0-9]+]] +; GCN-DAG: v_writelane_b32 [[CSR_VGPR_1]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]] ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 @@ -151,15 +155,16 @@ ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1 -; MUBUF: s_addk_i32 s32, 0xfc00 -; FLATSCR: s_add_i32 s32, s32, -16 -; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR_1]], [[FP_SPILL_LANE]] +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR_1]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s32 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; 
FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; MUBUF: s_addk_i32 s32, 0xfc00 +; FLATSCR: s_add_i32 s32, s32, -16 +; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] define void @callee_no_stack_with_call() #0 { @@ -273,14 +278,12 @@ ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. ; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr: ; GCN: s_waitcnt +; GCN-NEXT: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 +; GCN: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-60: v_writelane_b32 v0 -; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; GCN: v_writelane_b32 v0 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; GCN: v_writelane_b32 v0 @@ -290,14 +293,14 @@ ; GCN: v_writelane_b32 v0 ; MUBUF: s_addk_i32 s32, 0x400 -; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, 16 +; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, 
s32, -16 ; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]] -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @last_lane_vgpr_for_fp_csr() #1 { @@ -319,15 +322,13 @@ ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. ; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr: ; GCN: s_waitcnt +; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-COUNT-61: v_writelane_b32 v0, -; FLATSCR: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; FLATSCR-NEXT: s_mov_b32 s33, s32 -; MUBUF: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; GCN: v_writelane_b32 v0, @@ -340,13 +341,13 @@ ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v0 +; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, 
[[COPY_EXEC1]] ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @no_new_vgpr_for_fp_csr() #1 { @@ -393,30 +394,30 @@ ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp: ; GCN: s_waitcnt -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; MUBUF: s_addk_i32 s32, 0x300 -; FLATSCR: s_add_i32 s32, s32, 12 ; GCN: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} ; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN: ;;#ASMSTART +; MUBUF: s_addk_i32 s32, 0x300 +; FLATSCR: s_add_i32 s32, s32, 12 ; GCN: v_readlane_b32 s31, [[CSR_VGPR]], 1 ; GCN: v_readlane_b32 s30, [[CSR_VGPR]], 0 +; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 
4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; MUBUF: s_addk_i32 s32, 0xfd00 ; FLATSCR: s_add_i32 s32, s32, -12 ; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] define void @no_unused_non_csr_sgpr_for_fp() #1 { @@ -436,26 +437,26 @@ ; Need a new CSR VGPR to satisfy the FP spill. ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: ; GCN: s_waitcnt -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; MUBUF: s_addk_i32 s32, 0x300{{$}} -; FLATSCR: s_add_i32 s32, s32, 12{{$}} +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; MUBUF-DAG: buffer_store_dword ; FLATSCR-DAG: scratch_store_dword +; MUBUF: s_addk_i32 s32, 0x300{{$}} +; FLATSCR: s_add_i32 s32, s32, 12{{$}} ; GCN: ;;#ASMSTART +; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 
offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; MUBUF: s_addk_i32 s32, 0xfd00{{$}} ; FLATSCR: s_add_i32 s32, s32, -12{{$}} ; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { @@ -482,29 +483,29 @@ ; register is needed to access the CSR VGPR slot. ; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset: ; GCN: s_waitcnt +; GCN-NEXT: s_mov_b32 vcc_lo, s33 +; GCN-DAG: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100 +; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 +; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: s_mov_b32 vcc_lo, s33 -; GCN-DAG: s_mov_b32 s33, s32 ; MUBUF-DAG: s_add_i32 s32, s32, 0x40300{{$}} ; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}} ; MUBUF-DAG: buffer_store_dword ; FLATSCR-DAG: scratch_store_dword ; GCN: ;;#ASMSTART -; MUBUF: s_add_i32 s32, s32, 0xfffbfd00{{$}} -; FLATSCR: s_addk_i32 s32, 0xeff4{{$}} -; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100 +; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF-NEXT: 
s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload -; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004 +; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; MUBUF: s_add_i32 s32, s32, 0xfffbfd00{{$}} +; FLATSCR: s_addk_i32 s32, 0xeff4{{$}} +; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #1 { @@ -557,14 +558,16 @@ ; With no free registers, we must spill the FP to memory. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory: -; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; FLATSCR: s_mov_b32 s0, s33 ; GCN: s_mov_b32 s33, s32 -; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Reload -; FLATSCR: s_mov_b32 s33, s0 +; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], [[FP_SCRATCH_COPY]] +; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF: s_waitcnt vmcnt(0) -; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]] +; MUBUF: v_readfirstlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[TMP_VGPR2]] +; MUBUF: s_mov_b32 s33, [[FP_SCRATCH_COPY]] +; FLATSCR: s_mov_b32 s33, s0 ; GCN: s_setpc_b64 ; MUBUF: ScratchSize: 8 ; FLATSCR: ScratchSize: 0 @@ -587,18 +590,21 @@ ; need to spill the FP to memory if there are no free lanes in the reserved ; VGPR. 
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: +; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; FLATSCR: s_mov_b32 s33, s0 +; MUBUF: s_mov_b32 s33, s32 ; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]] +; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], [[FP_SCRATCH_COPY]] +; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s33 offset:[[OFF:[0-9]+]] ; GCN-NOT: v_writelane_b32 v40, s33 -; MUBUF: s_mov_b32 s33, s32 -; FLATSCR: s_mov_b32 s33, s0 ; GCN-NOT: v_readlane_b32 s33, v40 -; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]] -; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]] +; GCN-NOT: v_readlane_b32 s33, v40 +; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s33 offset:[[OFF]] +; MUBUF: v_readfirstlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[TMP_VGPR2]] ; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]] +; MUBUF: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", @@ -626,14 +632,14 @@ ; Make sure that the FP save happens after restoring exec from the same ; register. 
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: -; GCN-NOT: v_writelane_b32 v40, s33 -; FLATSCR: s_or_saveexec_b64 s[0:1], -1 -; FLATSCR: s_mov_b64 exec, s[0:1] ; FLATSCR: s_mov_b32 s0, s33 ; FLATSCR: s_mov_b32 s33, s32 -; FLATSCR: s_mov_b32 s33, s0 -; FLATSCR: s_or_saveexec_b64 s[0:1], -1 +; GCN-NOT: v_writelane_b32 v40, s33 +; FLATSCR: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR: s_mov_b64 exec, s[2:3] +; FLATSCR: s_or_saveexec_b64 s[2:3], -1 ; GCN-NOT: v_readlane_b32 s33, v40 +; FLATSCR: s_mov_b32 s33, s0 ; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_reg() #1 { call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", @@ -659,12 +665,14 @@ ; If the size of the offset exceeds the MUBUF offset field we need another ; scratch VGPR to hold the offset. ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset -; MUBUF: s_xor_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100 +; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; MUBUF: v_mov_b32_e32 v0, s33 +; MUBUF: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] ; GCN-NOT: v_mov_b32_e32 v0, 0x100c -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200 +; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200 ; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; FLATSCR: v_mov_b32_e32 v0, 0 ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -419,15 +419,15 @@ ; Requires loading and storing to stack slot. 
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: ; GCN-DAG: s_addk_i32 s32, 0x400{{$}} -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-DAG: buffer_load_dword [[TMP_REG:v[0-9]+]], off, s[0:3], s33{{$}} ; GCN: buffer_store_dword [[TMP_REG]], off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 +; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_addk_i32 s32, 0xfc00{{$}} -; GCN: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -425,15 +425,15 @@ ; Requires loading and storing to stack slot. 
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: ; GCN-DAG: s_addk_i32 s32, 0x400{{$}} -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-DAG: buffer_load_dword v32, off, s[0:3], s33{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 +; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_addk_i32 s32, 0xfc00{{$}} -; GCN: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -27,14 +27,15 @@ ; GCN-LABEL: call_split_type_used_outside_block_v2f32: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v41, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, 
func_v2f32@rel32@lo+4 @@ -42,12 +43,13 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb0: @@ -63,14 +65,15 @@ ; GCN-LABEL: call_split_type_used_outside_block_v3f32: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v41, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4 @@ -78,12 +81,13 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: 
v_readlane_b32 s4, v41, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb0: @@ -99,14 +103,15 @@ ; GCN-LABEL: call_split_type_used_outside_block_v4f16: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v41, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4 @@ -114,12 +119,13 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, 
off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb0: @@ -135,14 +141,15 @@ ; GCN-LABEL: call_split_type_used_outside_block_struct: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v41, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4 @@ -151,12 +158,13 @@ ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 0 -; GCN-NEXT: 
s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb0: diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -16,10 +16,12 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: v_writelane_b32 v40, s34, 2 @@ -33,8 +35,6 @@ ; CHECK-NEXT: v_writelane_b32 v40, s42, 10 ; CHECK-NEXT: v_writelane_b32 v40, s43, 11 ; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_writelane_b32 v42, s33, 0 -; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s45, 13 ; CHECK-NEXT: v_writelane_b32 v40, s46, 14 @@ -48,6 +48,7 @@ ; CHECK-NEXT: v_writelane_b32 v40, s47, 15 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: v_writelane_b32 v42, s16, 0 ; CHECK-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v41, v31 ; CHECK-NEXT: s_mov_b32 s42, s15 @@ -91,12 +92,13 @@ ; CHECK-NEXT: v_readlane_b32 s34, v40, 2 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s4, v42, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: v_readlane_b32 s33, v42, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .Ltmp2: diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; The custom CSR spills inserted during the frame lowering were previously using SP as the frame base. +; The offsets allocated for the CS objects go wrong when any local stack object has a higher +; alignment requirement than the default stack alignment for AMDGPU (either 4 or 16). The offsets +; in such cases should be from the newly aligned FP. Even to adjust the offset from the SP value +; at function entry, the FP-SP can't be statically determined with dynamic stack realignment.
To +; fix the problem, use FP as the frame base in the spills whenever the function has FP. + +define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 { +; GCN-LABEL: test_stack_realign: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xfc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: s_addk_i32 s32, 0x3000 +; GCN-NEXT: v_writelane_b32 v43, s16, 0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v42, s30, 0 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_writelane_b32 v42, s31, 1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: 
;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s31, v42, 1 +; GCN-NEXT: v_readlane_b32 s30, v42, 0 +; GCN-NEXT: v_readlane_b32 s4, v43, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0xd000 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %alloca.val = alloca <8 x i32>, align 64, addrspace(5) + store volatile <8 x i32> %val, ptr addrspace(5) %alloca.val, align 64 + call void asm sideeffect "", "~{v40}" () + call void asm sideeffect "", "~{v41}" () + call void @extern_func(i32 %idx) + ret void +} + +declare void @extern_func(i32) #0 + +attributes #0 = { noinline nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -10,15 +10,16 @@ ; SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: ; SPILL-TO-VGPR: ; %bb.0: ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s33, 0 +; SPILL-TO-VGPR-NEXT: s_mov_b32 s4, s33 ; 
SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 +; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[8:9], -1 +; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 ; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s4, 0 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 ; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) @@ -28,21 +29,23 @@ ; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 +; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v41, 0 +; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1 +; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; SPILL-TO-VGPR-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 -; SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v41, 0 -; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SPILL-TO-VGPR-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] ; ; NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call: ; NO-SPILL-TO-VGPR: ; %bb.0: ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, s33 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; 
NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s4, s33 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 +; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, s4 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x800 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[10:11], exec ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1 @@ -67,7 +70,7 @@ ; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; NO-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[8:9], exec ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -75,8 +78,8 @@ ; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v2, 0 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -84,11 +87,12 @@ ; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v1, 0 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] +; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xf800 -; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:12 ; 4-byte Folded Reload ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s33, v0 +; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s4, v0 +; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 ; NO-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -8,8 +8,10 @@ ; SDAG-LABEL: gfx_func: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, s33 +; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 -; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; SDAG-NEXT: s_mov_b64 exec, s[34:35] ; SDAG-NEXT: v_writelane_b32 v40, s4, 0 ; SDAG-NEXT: v_writelane_b32 v40, s5, 1 @@ -31,8 +33,6 @@ ; SDAG-NEXT: v_writelane_b32 v40, s21, 17 ; SDAG-NEXT: v_writelane_b32 v40, s22, 18 ; SDAG-NEXT: v_writelane_b32 v40, s23, 19 -; SDAG-NEXT: s_mov_b32 s36, s33 -; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x400 ; SDAG-NEXT: v_writelane_b32 v40, s24, 20 ; SDAG-NEXT: v_writelane_b32 v40, s25, 21 @@ -77,19 +77,21 @@ ; SDAG-NEXT: v_readlane_b32 s6, v40, 2 ; SDAG-NEXT: v_readlane_b32 s5, v40, 1 ; SDAG-NEXT: v_readlane_b32 s4, v40, 0 -; SDAG-NEXT: s_addk_i32 s32, 0xfc00 -; SDAG-NEXT: s_mov_b32 s33, s36 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 -; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] +; SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; SDAG-NEXT: s_mov_b32 s33, s36 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; 
; GISEL-LABEL: gfx_func: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, s33 +; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[34:35] ; GISEL-NEXT: v_writelane_b32 v40, s4, 0 ; GISEL-NEXT: v_writelane_b32 v40, s5, 1 @@ -111,8 +113,6 @@ ; GISEL-NEXT: v_writelane_b32 v40, s21, 17 ; GISEL-NEXT: v_writelane_b32 v40, s22, 18 ; GISEL-NEXT: v_writelane_b32 v40, s23, 19 -; GISEL-NEXT: s_mov_b32 s36, s33 -; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s24, 20 ; GISEL-NEXT: v_writelane_b32 v40, s25, 21 @@ -157,11 +157,11 @@ ; GISEL-NEXT: v_readlane_b32 s6, v40, 2 ; GISEL-NEXT: v_readlane_b32 s5, v40, 1 ; GISEL-NEXT: v_readlane_b32 s4, v40, 0 -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s36 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] +; GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GISEL-NEXT: s_mov_b32 s33, s36 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call void @extern_c_func() diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -97,15 +97,16 @@ ; GFX9-LABEL: test_call_external_void_func_i1_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 @@ -114,12 +115,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -127,32 +129,34 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, 
s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -160,31 +164,33 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 
; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -192,32 +198,34 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; 
GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i1(i1 true) @@ -228,16 +236,17 @@ ; GFX9-LABEL: test_call_external_void_func_i1_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 @@ -247,12 +256,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: 
s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -260,17 +270,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 @@ -280,14 +291,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: 
s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -295,17 +307,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 @@ -315,13 +328,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; @@ -329,17 +343,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 @@ -349,14 +364,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, 
s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %var = load volatile i1, ptr addrspace(1) undef @@ -368,16 +384,17 @@ ; GFX9-LABEL: test_call_external_void_func_i1_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 @@ -387,12 +404,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; 
GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -400,17 +418,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 @@ -420,14 +439,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: 
s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -435,17 +455,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 @@ -455,13 +476,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: 
scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -469,17 +491,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 @@ -489,14 +512,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %var = load volatile i1, ptr addrspace(1) undef @@ -508,15 +532,16 @@ ; GFX9-LABEL: test_call_external_void_func_i8_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4 @@ -524,12 +549,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 
0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -537,31 +563,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: 
v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -569,31 +597,33 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, 
v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -601,31 +631,33 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, 
external_void_func_i8@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i8(i8 123) @@ -636,16 +668,17 @@ ; GFX9-LABEL: test_call_external_void_func_i8_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: 
s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 @@ -653,12 +686,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -666,17 +700,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: 
v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_signext@rel32@hi+12 @@ -684,14 +719,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -699,17 +735,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; 
GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 @@ -718,13 +755,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -732,17 +770,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 @@ -750,14 +789,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %var = load volatile i8, ptr addrspace(1) undef @@ -769,16 +809,17 @@ ; GFX9-LABEL: test_call_external_void_func_i8_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 @@ -786,12 +827,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -799,17 +841,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill 
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_zeroext@rel32@hi+12 @@ -817,14 +860,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -832,17 +876,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 @@ -851,13 +896,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -865,17 +911,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 @@ -883,14 +930,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %var = load volatile i8, ptr addrspace(1) undef @@ -902,15 +950,16 @@ ; GFX9-LABEL: test_call_external_void_func_i16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 
s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4 @@ -918,12 +967,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -931,31 +981,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -963,31 +1015,33 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: 
s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -995,31 +1049,33 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, 
s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i16(i16 123) @@ -1030,16 +1086,17 @@ ; GFX9-LABEL: test_call_external_void_func_i16_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 @@ -1047,12 +1104,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1060,17 +1118,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_signext@rel32@hi+12 @@ -1078,14 +1137,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, 
s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1093,17 +1153,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 @@ -1112,13 +1173,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; 
GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1126,17 +1188,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 @@ -1144,14 +1207,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, 
off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %var = load volatile i16, ptr addrspace(1) undef @@ -1163,16 +1227,17 @@ ; GFX9-LABEL: test_call_external_void_func_i16_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 @@ -1180,12 +1245,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1193,17 +1259,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_zeroext@rel32@hi+12 @@ -1211,14 +1278,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; 
GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1226,17 +1294,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 @@ -1245,13 +1314,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: 
v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1259,17 +1329,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 @@ -1277,14 +1348,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %var = load volatile i16, ptr addrspace(1) undef @@ -1296,15 +1368,16 @@ ; GFX9-LABEL: test_call_external_void_func_i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4 @@ -1312,12 +1385,13 @@ ; 
GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1325,31 +1399,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, 
external_void_func_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1357,31 +1433,33 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1389,31 +1467,33 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i32(i32 42) @@ -1424,16 +1504,17 @@ ; GFX9-LABEL: test_call_external_void_func_i64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte 
Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 @@ -1441,12 +1522,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1454,17 +1536,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: 
v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 @@ -1472,14 +1555,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1487,16 +1571,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 -; GFX11-NEXT: 
v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 @@ -1505,13 +1590,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1519,17 +1605,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; 
GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 @@ -1537,14 +1624,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i64(i64 123) @@ -1555,17 +1643,18 @@ ; GFX9-LABEL: test_call_external_void_func_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, 
s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 @@ -1573,12 +1662,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1586,33 +1676,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded 
Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1620,33 +1712,35 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1654,33 +1748,35 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, 
s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i64>, ptr addrspace(1) null @@ -1692,18 +1788,19 @@ ; GFX9-LABEL: test_call_external_void_func_v2i64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 @@ -1711,12 +1808,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: 
v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1724,19 +1822,20 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 @@ -1744,14 +1843,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1759,17 +1859,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 @@ -1778,13 +1879,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 
offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1792,19 +1894,20 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 @@ -1812,14 +1915,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> ) @@ -1830,19 +1934,20 @@ ; GFX9-LABEL: test_call_external_void_func_v3i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, 
external_void_func_v3i64@rel32@lo+4 @@ -1850,12 +1955,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1863,20 +1969,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: 
global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4 @@ -1884,14 +1991,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1899,33 +2007,35 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: 
s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1933,20 +2043,21 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 
exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 @@ -1954,14 +2065,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %load = load <2 x i64>, ptr addrspace(1) null @@ -1975,21 +2087,22 @@ ; GFX9-LABEL: test_call_external_void_func_v4i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; 
GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 ; GFX9-NEXT: v_mov_b32_e32 v6, 3 ; GFX9-NEXT: v_mov_b32_e32 v7, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4 @@ -1997,12 +2110,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2010,11 +2124,13 @@ ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 @@ -2023,9 +2139,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v7, 4 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4 @@ -2033,14 +2148,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; 
GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2048,19 +2164,20 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 @@ -2069,13 +2186,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; 
GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2083,11 +2201,13 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 @@ -2096,9 +2216,8 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 @@ -2106,14 +2225,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; 
GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %load = load <2 x i64>, ptr addrspace(1) null @@ -2126,15 +2246,16 @@ ; GFX9-LABEL: test_call_external_void_func_f16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4 @@ -2142,12 +2263,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2155,31 +2277,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: 
s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2187,31 +2311,33 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: 
v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2219,31 +2345,33 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; 
GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_f16(half 4.0) @@ -2254,15 +2382,16 @@ ; GFX9-LABEL: test_call_external_void_func_f32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 
s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4 @@ -2270,12 +2399,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2283,31 +2413,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: 
s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2315,31 +2447,33 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX11-NEXT: 
s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2347,31 +2481,33 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, 
s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_f32(float 4.0) @@ -2382,16 +2518,17 @@ ; GFX9-LABEL: test_call_external_void_func_v2f32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 @@ -2399,12 +2536,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2412,17 +2550,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; 
GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 @@ -2430,14 +2569,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2445,16 +2585,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 @@ -2463,13 +2604,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2477,17 +2619,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 @@ -2495,14 +2638,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2f32(<2 x float> ) @@ -2513,17 +2657,18 @@ ; GFX9-LABEL: test_call_external_void_func_v3f32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, 
s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4 @@ -2531,12 +2676,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2544,18 +2690,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword 
v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4 @@ -2563,14 +2710,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2578,17 +2726,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; 
GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 @@ -2597,13 +2746,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2611,18 +2761,19 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 @@ -2630,14 +2781,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3f32(<3 x float> ) @@ -2648,12 +2800,12 @@ ; GFX9-LABEL: test_call_external_void_func_v5f32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 
s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2661,6 +2813,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4 @@ -2668,12 +2821,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2681,20 +2835,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0.5 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4 @@ -2702,14 +2857,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ 
-2717,18 +2873,19 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 @@ -2737,13 +2894,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,20 +2909,21 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0.5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 @@ -2772,14 +2931,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, 
off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v5f32(<5 x float> ) @@ -2790,16 +2950,17 @@ ; GFX9-LABEL: test_call_external_void_func_f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 @@ -2807,12 +2968,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: 
s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2820,17 +2982,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 @@ -2838,14 +3001,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; 
GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2853,16 +3017,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 @@ -2871,13 +3036,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; 
GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2885,17 +3051,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 @@ -2903,14 +3070,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; 
GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_f64(double 4.0) @@ -2921,18 +3089,19 @@ ; GFX9-LABEL: test_call_external_void_func_v2f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4 @@ -2940,12 +3109,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2953,19 +3123,20 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4 @@ -2973,14 +3144,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: 
v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,17 +3160,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 @@ -3007,13 +3180,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, 
-1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3021,19 +3195,20 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 @@ -3041,14 +3216,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2f64(<2 x double> ) @@ -3059,12 +3235,12 @@ ; GFX9-LABEL: test_call_external_void_func_v3f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3073,6 +3249,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX9-NEXT: 
v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f64@rel32@lo+4 @@ -3080,12 +3257,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3093,11 +3271,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 @@ -3105,9 +3285,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: 
v_mov_b32_e32 v5, 0x40200000 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64@rel32@lo+4 @@ -3115,14 +3294,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3130,18 +3310,19 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 ; GFX11-NEXT: v_dual_mov_b32 
v4, 0 :: v_dual_mov_b32 v5, 0x40200000 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 @@ -3150,13 +3331,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3164,11 +3346,13 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; 
GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 @@ -3176,9 +3360,8 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 @@ -3186,14 +3369,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3f64(<3 x double> ) @@ -3204,15 +3388,16 @@ ; GFX9-LABEL: test_call_external_void_func_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 
4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 @@ -3220,12 +3405,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3233,16 +3419,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 
s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16@rel32@hi+12 @@ -3250,14 +3437,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3265,16 +3453,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 @@ -3283,13 +3472,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3297,16 +3487,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 @@ -3314,14 +3505,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i16>, ptr addrspace(1) undef @@ -3333,15 +3525,16 @@ ; GFX9-LABEL: test_call_external_void_func_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 @@ -3349,12 +3542,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3362,16 +3556,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; 
GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 @@ -3379,14 +3574,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3394,16 +3590,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 @@ -3412,13 +3609,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3426,16 +3624,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword 
off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 @@ -3443,14 +3642,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <3 x i16>, 
ptr addrspace(1) undef @@ -3462,15 +3662,16 @@ ; GFX9-LABEL: test_call_external_void_func_v3f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 @@ -3478,12 +3679,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; 
@@ -3491,16 +3693,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 @@ -3508,14 +3711,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) 
; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3523,16 +3727,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 @@ -3541,13 +3746,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3555,16 +3761,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 @@ -3572,14 +3779,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <3 x half>, ptr addrspace(1) undef @@ -3591,16 +3799,17 @@ ; GFX9-LABEL: test_call_external_void_func_v3i16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 @@ -3608,12 +3817,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 
4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3621,17 +3831,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 @@ -3639,14 +3850,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 
exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3654,16 +3866,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 @@ -3672,13 +3885,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3686,17 
+3900,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 @@ -3704,14 +3919,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> ) @@ -3722,16 +3938,17 @@ ; GFX9-LABEL: test_call_external_void_func_v3f16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 @@ -3739,12 +3956,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: 
s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3752,17 +3970,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 @@ -3770,14 +3989,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; 
GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3785,17 +4005,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 @@ -3804,13 +4025,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 
offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3818,17 +4040,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 @@ -3836,14 +4059,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, 
off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3f16(<3 x half> ) @@ -3854,15 +4078,16 @@ ; GFX9-LABEL: test_call_external_void_func_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 @@ -3870,12 +4095,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3883,16 +4109,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 @@ -3900,14 +4127,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: 
v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3915,16 +4143,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 @@ -3933,13 +4162,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3947,16 +4177,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 @@ -3964,14 +4195,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <4 x i16>, ptr addrspace(1) undef @@ -3983,16 +4215,17 @@ ; GFX9-LABEL: test_call_external_void_func_v4i16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 @@ -4000,12 +4233,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 
s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4013,17 +4247,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 @@ -4031,14 +4266,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; 
GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4046,17 +4282,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 @@ -4065,13 +4302,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, 
v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4079,17 +4317,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 @@ -4097,14 +4336,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 
s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> ) @@ -4115,15 +4355,16 @@ ; GFX9-LABEL: test_call_external_void_func_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: 
s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 @@ -4131,12 +4372,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4144,16 +4386,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16@rel32@hi+12 @@ -4161,14 +4404,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4176,16 +4420,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 @@ -4194,13 +4439,14 @@ ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4208,16 +4454,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; 
GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 @@ -4225,14 +4472,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <2 x half>, ptr addrspace(1) undef @@ -4244,15 +4492,16 @@ ; GFX9-LABEL: test_call_external_void_func_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: 
s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 @@ -4260,12 +4509,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4273,16 +4523,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: 
s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 @@ -4290,14 +4541,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4305,16 +4557,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; 
GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 @@ -4323,13 +4576,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4337,16 +4591,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 @@ -4354,14 +4609,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i32>, ptr addrspace(1) undef @@ -4373,16 +4629,17 @@ ; GFX9-LABEL: test_call_external_void_func_v2i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, 
off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 @@ -4390,12 +4647,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4403,17 +4661,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 
exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 @@ -4421,14 +4680,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4436,16 +4696,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 -; 
GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 @@ -4454,13 +4715,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4468,17 +4730,18 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; 
GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 @@ -4486,14 +4749,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> ) @@ -4504,17 +4768,18 @@ ; GFX9-LABEL: test_call_external_void_func_v3i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], 
-1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4 @@ -4522,12 +4787,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4535,18 +4801,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, 
off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4 @@ -4554,14 +4821,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4569,17 +4837,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: 
scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_mov_b32_e32 v2, 5 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 @@ -4588,13 +4857,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4602,18 +4872,19 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 @@ -4621,14 +4892,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3i32(<3 x i32> ) @@ -4639,18 +4911,19 @@ ; GFX9-LABEL: test_call_external_void_func_v3i32_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, 6 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4 @@ -4658,12 +4931,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4671,19 +4945,20 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; 
GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4 @@ -4691,14 +4966,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4706,17 +4982,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 @@ -4725,13 +5002,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4739,19 +5017,20 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 @@ -4759,14 +5038,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: 
s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32> , i32 6) @@ -4777,15 +5057,16 @@ ; GFX9-LABEL: test_call_external_void_func_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 @@ -4793,12 +5074,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte 
Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4806,16 +5088,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12 @@ -4823,14 +5106,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 
exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4838,16 +5122,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 @@ -4856,13 +5141,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] 
; @@ -4870,16 +5156,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 @@ -4887,14 +5174,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <4 x i32>, ptr addrspace(1) undef @@ -4906,18 +5194,19 @@ ; GFX9-LABEL: test_call_external_void_func_v4i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 @@ -4925,12 +5214,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 
s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4938,19 +5228,20 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 @@ -4958,14 +5249,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, 
s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4973,17 +5265,18 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 @@ -4992,13 +5285,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: 
s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5006,19 +5300,20 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 @@ -5026,14 +5321,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, 
-1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> ) @@ -5044,12 +5340,12 @@ ; GFX9-LABEL: test_call_external_void_func_v5i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5057,6 +5353,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 ; GFX9-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4 @@ -5064,12 +5361,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5077,20 +5375,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, 
external_void_func_v5i32@rel32@lo+4 @@ -5098,14 +5397,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5113,18 +5413,19 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_mov_b32_e32 v4, 5 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 @@ 
-5133,13 +5434,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5147,20 +5449,21 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 
16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 @@ -5168,14 +5471,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v5i32(<5 x i32> ) @@ -5186,19 +5490,20 @@ ; GFX9-LABEL: test_call_external_void_func_v8i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: 
v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 @@ -5206,12 +5511,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5219,16 +5525,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; 
GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -5241,14 +5548,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5256,16 +5564,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 
exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -5279,13 +5588,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5293,16 +5603,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 
v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 @@ -5315,14 +5626,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr = load ptr addrspace(1), ptr addrspace(4) undef @@ -5335,12 +5647,12 @@ ; GFX9-LABEL: test_call_external_void_func_v8i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5351,6 +5663,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, 6 ; GFX9-NEXT: v_mov_b32_e32 v6, 7 ; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 @@ -5358,12 +5671,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5371,11 +5685,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; 
GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 @@ -5385,9 +5701,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-NEXT: v_mov_b32_e32 v7, 8 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 @@ -5395,14 +5710,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5410,19 +5726,20 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: 
scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 ; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 @@ -5431,13 +5748,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5445,11 +5763,13 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 
s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 @@ -5459,9 +5779,8 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 @@ -5469,14 +5788,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> ) @@ -5487,21 +5807,22 @@ ; GFX9-LABEL: test_call_external_void_func_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[34:35] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[34:35] offset:16 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[34:35] offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4 @@ -5509,12 +5830,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; 
GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5522,16 +5844,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 @@ -5546,14 +5869,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5561,16 +5885,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -5586,13 +5911,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, 
s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5600,16 +5926,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x3 @@ -5624,14 +5951,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: 
scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr = load ptr addrspace(1), ptr addrspace(4) undef @@ -5644,15 +5972,17 @@ ; GFX9-LABEL: test_call_external_void_func_v32i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[34:35] offset:16 @@ -5663,7 +5993,6 @@ ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[34:35] offset:96 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[34:35] offset:112 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4 @@ -5671,12 +6000,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5684,16 +6014,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -5712,14 +6043,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 
s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5727,16 +6059,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x7 @@ -5756,13 +6089,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: 
s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5770,16 +6104,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 @@ -5798,14 +6133,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: 
s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr = load ptr addrspace(1), ptr addrspace(4) undef @@ -5818,15 +6154,17 @@ ; GFX9-LABEL: test_call_external_void_func_v32i32_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: global_load_dword v32, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[34:35] offset:16 @@ -5837,7 +6175,6 @@ ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[34:35] offset:96 
; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[34:35] offset:112 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -5848,12 +6185,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5861,16 +6199,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, 
s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5892,14 +6231,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5907,16 +6247,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: global_load_b32 v32, v[0:1], off ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5938,13 +6279,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5952,16 +6294,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: 
global_load_dword v33, v[0:1], off ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -5983,14 +6326,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef @@ -6004,18 +6348,19 @@ ; GFX9-LABEL: test_call_external_i32_func_i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v43, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_writelane_b32 v43, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: v_mov_b32_e32 v42, v1 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -6028,12 +6373,13 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v43, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v43, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6041,21 +6387,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v43, s33, 0 +; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], 
s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-NEXT: v_writelane_b32 v43, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 @@ -6067,14 +6414,15 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v43, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v43, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6082,13 +6430,13 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:12 -; 
GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v43, s33, 0 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:12 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 @@ -6096,6 +6444,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v43, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 @@ -6109,13 +6458,14 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v43, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v43, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:12 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:12 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6123,21 +6473,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, 
v43, s32 offset:12 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s33, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 offset:12 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 @@ -6149,14 +6500,15 @@ ; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v43, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v43, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 -; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s32 offset:12 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: 
scratch_load_dword v43, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42) @@ -6168,19 +6520,20 @@ ; GFX9-LABEL: test_call_external_void_func_struct_i8_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v2, s[34:35] ; GFX9-NEXT: global_load_dword v1, v2, s[34:35] offset:4 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_struct_i8_i32@rel32@lo+4 @@ -6188,12 +6541,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword 
v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6201,16 +6555,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -6223,14 +6578,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; 
GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6238,16 +6594,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -6261,13 +6618,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 
exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6275,16 +6633,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 @@ -6297,14 +6656,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: 
scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef @@ -6317,12 +6677,12 @@ ; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 @@ -6330,6 +6690,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 @@ -6337,12 +6698,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; 
GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6350,35 +6712,37 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: 
s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6386,35 +6750,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:12 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_clause 
0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, s33 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, s33 -; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:12 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6422,35 +6788,37 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, 
v41, s33 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = alloca { i8, i32 }, align 4, addrspace(5) @@ -6466,12 +6834,12 @@ ; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 @@ -6481,6 +6849,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 @@ -6490,17 +6859,18 @@ ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6508,21 +6878,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 -; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 @@ -6534,19 +6905,20 @@ ; GFX10-NEXT: 
buffer_load_dword v1, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6554,15 +6926,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:20 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:20 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -6579,18 +6952,19 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:20 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:20 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6598,16 +6972,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:16 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:20 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:20 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 
+; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -6624,19 +6999,20 @@ ; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: global_store_dword v[0:1], v1, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:20 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:20 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %in.val = alloca { i8, i32 }, align 4, addrspace(5) @@ -6660,19 +7036,20 @@ ; GFX9-LABEL: test_call_external_void_func_v16i8: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35] ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v16i8@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_v16i8@rel32@hi+12 @@ -6698,12 +7075,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 
4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6711,16 +7089,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6751,14 +7130,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; 
GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6766,16 +7146,17 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6803,13 +7184,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6817,16 +7199,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -6857,14 +7240,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 
-16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr = load ptr addrspace(1), ptr addrspace(4) undef @@ -6877,11 +7261,11 @@ ; GFX9-LABEL: tail_call_byval_align16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 @@ -6958,11 +7342,11 @@ ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6970,12 +7354,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, 
s4 -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 @@ -7053,12 +7437,12 @@ ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7066,11 +7450,11 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:24 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s4, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16 ; GFX11-NEXT: scratch_load_b32 v31, off, s33 @@ -7145,11 +7529,11 @@ ; GFX11-NEXT: v_readlane_b32 s34, v40, 2 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: s_mov_b32 s33, s4 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:24 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: 
s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_mov_b32 s33, s4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7157,12 +7541,12 @@ ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:24 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, s33 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16 ; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 @@ -7237,12 +7621,12 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:24 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: @@ -7256,15 +7640,16 @@ ; GFX9-LABEL: test_call_external_void_func_i1_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 @@ -7273,12 +7658,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7286,32 +7672,34 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: 
s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7319,31 +7707,33 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; 
GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7351,32 +7741,34 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 
offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; 
GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i1_inreg(i1 inreg true) @@ -7387,16 +7779,17 @@ ; GFX9-LABEL: test_call_external_void_func_i8_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 @@ -7405,12 +7798,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7418,33 +7812,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7452,33 +7848,35 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 
exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7486,33 +7884,35 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i8_inreg(i8 inreg 123) @@ -7523,16 +7923,17 @@ ; GFX9-LABEL: test_call_external_void_func_i16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 @@ -7541,12 +7942,13 @@ 
; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7554,33 +7956,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, 
external_void_func_i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7588,33 +7992,35 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; 
GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7622,33 +8028,35 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 
s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i16_inreg(i16 inreg 123) @@ -7659,16 +8067,17 @@ ; GFX9-LABEL: test_call_external_void_func_i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; 
GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 42 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 @@ -7677,12 +8086,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7690,33 +8100,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; 
GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 42 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7724,33 +8136,35 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 42 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7758,33 +8172,35 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, 
s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i32_inreg(i32 inreg 42) @@ -7795,18 +8211,19 @@ ; GFX9-LABEL: test_call_external_void_func_i64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 @@ -7816,12 +8233,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; 
GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7829,21 +8247,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -7851,14 +8270,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; 
GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7866,21 +8286,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7889,13 +8310,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7903,21 +8325,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: 
s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7925,14 +8348,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_i64_inreg(i64 inreg 123) @@ -7943,18 +8367,19 @@ ; GFX9-LABEL: test_call_external_void_func_v2i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], 
s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 @@ -7968,12 +8393,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7981,15 +8407,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 
offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -8007,14 +8434,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8022,15 +8450,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 
0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -8048,14 +8477,15 @@ ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8063,15 +8493,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -8089,14 +8520,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i64>, ptr addrspace(4) null @@ -8108,22 +8540,23 @@ ; GFX9-LABEL: test_call_external_void_func_v2i64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 @@ -8135,12 +8568,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8148,21 +8582,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8176,14 +8611,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8191,21 +8627,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: 
s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -8220,13 +8657,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8234,21 +8672,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8262,14 +8701,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: 
scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg ) @@ -8280,20 +8720,21 @@ ; GFX9-LABEL: test_call_external_void_func_v3i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 ; GFX9-NEXT: s_mov_b32 s8, 1 @@ -8311,12 +8752,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: 
s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8324,15 +8766,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -8356,14 +8799,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 
s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8371,15 +8815,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -8404,13 +8849,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; 
GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8418,15 +8864,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -8450,14 +8897,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: 
scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %load = load <2 x i64>, ptr addrspace(4) null @@ -8471,13 +8919,16 @@ ; GFX9-LABEL: test_call_external_void_func_v4i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 @@ -8485,8 +8936,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s8, 1 @@ -8508,12 +8957,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 
s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8521,15 +8971,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -8559,14 +9010,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; 
GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8574,15 +9026,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -8613,13 +9066,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: 
s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8627,15 +9081,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -8665,14 +9120,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; 
GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %load = load <2 x i64>, ptr addrspace(4) null @@ -8685,16 +9141,17 @@ ; GFX9-LABEL: test_call_external_void_func_f16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x4400 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 @@ -8703,12 +9160,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8716,33 +9174,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: 
s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8750,33 +9210,35 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x4400 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 
s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8784,33 +9246,35 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, 
s1, external_void_func_f16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_f16_inreg(half inreg 4.0) @@ -8821,16 +9285,17 @@ ; GFX9-LABEL: test_call_external_void_func_f32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; 
GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 4.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 @@ -8839,12 +9304,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8852,33 +9318,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 
0 ; GFX10-NEXT: s_mov_b32 s4, 4.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8886,33 +9354,35 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: 
v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 4.0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8920,33 +9390,35 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_f32_inreg(float inreg 4.0) @@ -8957,18 +9429,19 @@ ; GFX9-LABEL: 
test_call_external_void_func_v2f32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 @@ -8978,12 +9451,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; @@ -8991,21 +9465,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -9013,14 +9488,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], 
s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9028,21 +9504,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9051,13 +9528,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9065,21 +9543,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: 
s_swappc_b64 s[30:31], s[0:1] @@ -9087,14 +9566,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg ) @@ -9105,20 +9585,21 @@ ; GFX9-LABEL: test_call_external_void_func_v3f32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 
s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 4.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 @@ -9129,12 +9610,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9142,21 +9624,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; 
GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 @@ -9167,14 +9650,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9182,21 +9666,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, 
v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 4.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 3 @@ -9208,13 +9693,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9222,21 +9708,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; 
GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 @@ -9247,14 +9734,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: 
s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> inreg ) @@ -9265,17 +9753,17 @@ ; GFX9-LABEL: test_call_external_void_func_v5f32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s4, 1.0 @@ -9283,6 +9771,7 @@ ; GFX9-NEXT: s_mov_b32 s6, 4.0 ; GFX9-NEXT: s_mov_b32 s7, -1.0 ; GFX9-NEXT: s_mov_b32 s8, 0.5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 @@ -9295,12 +9784,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9308,21 +9798,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9339,14 +9830,15 @@ ; GFX10-NEXT: 
v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9354,21 +9846,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: 
v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 4.0 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -9386,13 +9879,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9400,21 +9894,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: 
s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9431,14 +9926,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> inreg ) @@ -9449,18 +9945,19 @@ ; GFX9-LABEL: test_call_external_void_func_f64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, 
s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 @@ -9470,12 +9967,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9483,21 +9981,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, 
s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -9505,14 +10004,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9520,21 +10020,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40100000 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9543,13 +10044,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9557,21 +10059,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9579,14 +10082,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_f64_inreg(double inreg 4.0) @@ -9597,22 +10101,23 @@ ; GFX9-LABEL: test_call_external_void_func_v2f64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: 
s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 @@ -9624,12 +10129,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9637,21 +10143,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: 
s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9665,14 +10172,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9680,21 +10188,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, 
s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -9709,13 +10218,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9723,21 +10233,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 
0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9751,14 +10262,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> inreg ) @@ -9769,18 +10281,18 @@ ; 
GFX9-LABEL: test_call_external_void_func_v3f64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 ; GFX9-NEXT: s_mov_b32 s4, 0 @@ -9789,6 +10301,7 @@ ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 ; GFX9-NEXT: s_mov_b32 s8, 0 ; GFX9-NEXT: s_mov_b32 s9, 0x40200000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 7 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 @@ -9802,12 +10315,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9815,21 +10329,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -9849,14 +10364,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, 
v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9864,21 +10380,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -9899,13 +10416,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: 
s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9913,21 +10431,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, 
external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -9947,14 +10466,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> inreg ) @@ -9965,16 +10485,17 @@ ; GFX9-LABEL: test_call_external_void_func_v2i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte 
Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 @@ -9983,12 +10504,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9996,33 +10518,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; 
GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10030,33 +10554,35 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: 
scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10064,33 +10590,35 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: 
s_setpc_b64 s[30:31] %val = load <2 x i16>, ptr addrspace(4) undef @@ -10102,17 +10630,18 @@ ; GFX9-LABEL: test_call_external_void_func_v3i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 @@ -10122,12 +10651,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, 
s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10135,15 +10665,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -10156,14 +10687,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10171,15 +10703,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -10193,13 +10726,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10207,15 +10741,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -10228,14 +10763,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <3 x i16>, ptr addrspace(4) 
undef @@ -10247,17 +10783,18 @@ ; GFX9-LABEL: test_call_external_void_func_v3f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 @@ -10267,12 +10804,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10280,15 +10818,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -10301,14 +10840,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10316,15 
+10856,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -10338,13 +10879,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10352,15 +10894,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -10373,14 +10916,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <3 x half>, ptr addrspace(4) undef @@ -10392,18 +10936,19 @@ ; GFX9-LABEL: 
test_call_external_void_func_v3i16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 3 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 @@ -10413,12 +10958,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; @@ -10426,21 +10972,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 3 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 3 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -10448,14 +10995,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, 
s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10463,21 +11011,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0x20001 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 3 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 3 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10486,13 +11035,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10500,21 +11050,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; 
GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10522,14 +11073,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg ) @@ -10540,18 +11092,19 @@ ; GFX9-LABEL: test_call_external_void_func_v3f16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; 
GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX9-NEXT: s_movk_i32 s5, 0x4400 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 @@ -10561,12 +11114,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10574,21 +11128,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 
0x40003c00 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -10596,14 +11151,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10611,21 +11167,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: 
scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_movk_i32 s5, 0x4400 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_movk_i32 s5, 0x4400 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10634,13 +11191,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10648,21 +11206,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded 
Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10670,14 +11229,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg ) @@ -10688,17 +11248,18 @@ ; GFX9-LABEL: test_call_external_void_func_v4i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 @@ -10708,12 +11269,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 
s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10721,15 +11283,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -10742,14 +11305,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, 
s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10757,15 +11321,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -10779,13 +11344,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10793,15 +11359,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -10814,14 +11381,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 
exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <4 x i16>, ptr addrspace(4) undef @@ -10833,18 +11401,19 @@ ; GFX9-LABEL: test_call_external_void_func_v4i16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 0x40003 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 @@ -10854,12 +11423,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; 
GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10867,21 +11437,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -10889,14 +11460,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: 
v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10904,21 +11476,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0x20001 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0x40003 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40003 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10927,13 +11500,14 @@ ; 
GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10941,21 +11515,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; 
GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -10963,14 +11538,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg ) @@ -10981,16 +11557,17 @@ ; GFX9-LABEL: test_call_external_void_func_v2f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; 
GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 @@ -10999,12 +11576,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11012,33 +11590,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 
4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11046,33 +11626,35 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11080,33 +11662,35 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <2 x half>, ptr addrspace(4) undef @@ -11118,17 +11702,18 @@ ; GFX9-LABEL: test_call_external_void_func_v2i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 @@ -11138,12 +11723,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, 
s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11151,15 +11737,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -11172,14 +11759,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 
exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11187,15 +11775,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -11209,13 +11798,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11223,15 +11813,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -11244,14 +11835,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 
s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i32>, ptr addrspace(4) undef @@ -11263,18 +11855,19 @@ ; GFX9-LABEL: test_call_external_void_func_v2i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 @@ -11284,12 +11877,13 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword 
v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11297,21 +11891,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -11319,14 +11914,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 
s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11334,21 +11930,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -11357,13 +11954,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; 
GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11371,21 +11969,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -11393,14 +11992,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg ) @@ -11411,20 +12011,21 @@ ; GFX9-LABEL: test_call_external_void_func_v3i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 @@ -11435,12 +12036,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11448,21 +12050,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword 
v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 5 ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 @@ -11473,14 +12076,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11488,21 +12092,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 3 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 4 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 5 ; GFX11-NEXT: v_writelane_b32 v40, s30, 3 @@ -11514,13 +12119,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11528,21 +12134,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 @@ -11553,14 +12160,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> inreg ) @@ -11571,22 +12179,23 @@ ; GFX9-LABEL: test_call_external_void_func_v3i32_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 ; GFX9-NEXT: s_mov_b32 s7, 6 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -11598,12 +12207,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 
s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11611,21 +12221,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: 
s_mov_b32 s6, 5 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11639,14 +12250,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11654,21 +12266,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 3 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 4 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, 
external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 5 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -11683,13 +12296,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11697,21 +12311,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: 
s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -11725,14 +12340,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> inreg , i32 inreg 6) @@ -11743,19 +12359,20 @@ ; GFX9-LABEL: test_call_external_void_func_v4i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded 
Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 @@ -11767,12 +12384,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11780,15 +12398,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11805,14 +12424,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11820,15 +12440,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; 
GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -11846,13 +12467,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11860,15 +12482,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -11885,14 +12508,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %val = load <4 x i32>, ptr addrspace(4) undef @@ -11904,22 +12528,23 @@ ; GFX9-LABEL: test_call_external_void_func_v4i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte 
Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 @@ -11931,12 +12556,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11944,21 +12570,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11972,14 +12599,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; 
GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11987,21 +12615,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -12016,13 +12645,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; 
GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12030,21 +12660,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -12058,14 +12689,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, 
v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg ) @@ -12076,17 +12708,17 @@ ; GFX9-LABEL: test_call_external_void_func_v5i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: 
v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s4, 1 @@ -12094,6 +12726,7 @@ ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 ; GFX9-NEXT: s_mov_b32 s8, 5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 @@ -12106,12 +12739,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -12119,21 +12753,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; 
GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -12150,14 +12785,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12165,21 +12801,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, 
s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -12197,13 +12834,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12211,21 +12849,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: 
s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -12242,14 +12881,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> inreg ) @@ -12260,12 +12900,15 @@ ; GFX9-LABEL: test_call_external_void_func_v8i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -12275,8 +12918,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: v_writelane_b32 v40, s31, 9 @@ -12294,12 +12935,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: 
s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -12307,15 +12949,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -12342,14 +12985,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12357,15 +13001,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -12393,13 +13038,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 
; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12407,15 +13053,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -12442,14 +13089,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr = load ptr addrspace(4), ptr addrspace(4) undef @@ -12462,10 +13110,12 @@ ; GFX9-LABEL: test_call_external_void_func_v8i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12474,8 +13124,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s4, 1 @@ -12486,6 +13134,7 @@ ; GFX9-NEXT: s_mov_b32 s9, 6 ; GFX9-NEXT: s_mov_b32 s10, 7 ; GFX9-NEXT: s_mov_b32 s11, 8 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 @@ -12501,12 +13150,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; 
GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -12514,21 +13164,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, 
s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -12554,14 +13205,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12569,21 +13221,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, 
external_void_func_v8i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -12610,13 +13263,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12624,21 +13278,22 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: 
s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -12664,14 +13319,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg ) @@ -12682,10 +13338,12 @@ ; GFX9-LABEL: test_call_external_void_func_v16i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12696,6 +13354,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 @@ -12705,8 +13364,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 17 @@ -12732,12 +13389,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, 
s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -12745,15 +13403,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -12796,14 +13455,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 
s[30:31] ; @@ -12811,15 +13471,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -12863,13 +13524,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12877,15 +13539,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte 
Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -12928,14 +13591,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr = load ptr addrspace(4), ptr addrspace(4) undef @@ -12948,10 +13612,12 
@@ ; GFX9-LABEL: test_call_external_void_func_v32i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12967,6 +13633,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 @@ -12980,8 +13647,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 ; GFX9-NEXT: v_writelane_b32 v40, s27, 23 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s28, 24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -13042,12 +13707,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: 
s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -13055,15 +13721,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -13151,14 +13818,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], 
s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -13166,15 +13834,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -13257,13 +13926,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -13271,15 
+13941,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -13363,14 +14034,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: 
s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr = load ptr addrspace(4), ptr addrspace(4) undef @@ -13383,10 +14055,12 @@ ; GFX9-LABEL: test_call_external_void_func_v32i32_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13401,6 +14075,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 @@ -13416,8 +14091,6 @@ ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s24, 20 ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -13482,12 +14155,13 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; 
GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -13495,15 +14169,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -13596,14 +14271,15 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; 
GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -13611,15 +14287,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -13705,13 +14382,14 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; 
GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -13719,15 +14397,16 @@ ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -13816,14 +14495,15 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; 
GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] %ptr0 = load ptr addrspace(4), ptr addrspace(4) undef @@ -13837,16 +14517,17 @@ ; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 @@ -13858,12 +14539,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -13871,18 +14553,19 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12 @@ -13894,14 +14577,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: 
v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -13909,16 +14593,17 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:12 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 @@ -13928,13 +14613,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; 
GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:12 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -13942,16 +14628,17 @@ ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: 
s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 @@ -13961,14 +14648,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: @@ -13980,12 +14668,12 @@ ; GFX9-LABEL: stack_12xv3i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -14028,6 +14716,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 9 ; 
GFX9-NEXT: v_mov_b32_e32 v30, 10 ; GFX9-NEXT: v_mov_b32_e32 v31, 11 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_12xv3i32@rel32@lo+4 @@ -14035,12 +14724,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -14048,16 +14738,16 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-NEXT: v_mov_b32_e32 v2, 14 -; GFX10-NEXT: v_writelane_b32 
v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 @@ -14097,6 +14787,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-NEXT: v_mov_b32_e32 v31, 11 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_12xv3i32@rel32@lo+4 @@ -14104,14 +14795,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14119,21 +14811,21 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 12 
:: v_dual_mov_b32 v1, 13 ; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1 -; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 ; GFX11-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, 2 ; GFX11-NEXT: v_dual_mov_b32 v8, 2 :: v_dual_mov_b32 v9, 3 ; GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_mov_b32 v11, 3 @@ -14147,6 +14839,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v26, 8 :: v_dual_mov_b32 v27, 9 ; GFX11-NEXT: v_dual_mov_b32 v28, 9 :: v_dual_mov_b32 v29, 9 ; GFX11-NEXT: v_dual_mov_b32 v30, 10 :: v_dual_mov_b32 v31, 11 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 @@ -14155,13 +14848,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14169,26 +14863,26 @@ ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 2 @@ -14215,6 +14909,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 @@ 
-14222,14 +14917,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: @@ -14253,12 +14949,12 @@ ; GFX9-LABEL: stack_8xv5i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -14309,6 +15005,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 5 ; GFX9-NEXT: v_mov_b32_e32 v30, 6 ; GFX9-NEXT: v_mov_b32_e32 v31, 7 +; 
GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_8xv5i32@rel32@lo+4 @@ -14316,12 +15013,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -14329,24 +15027,24 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 10 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 
0x200 +; GFX10-NEXT: v_mov_b32_e32 v3, 14 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v0, 11 ; GFX10-NEXT: v_mov_b32_e32 v1, 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 13 -; GFX10-NEXT: v_mov_b32_e32 v3, 14 ; GFX10-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -14386,6 +15084,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-NEXT: v_mov_b32_e32 v31, 7 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_8xv5i32@rel32@lo+4 @@ -14393,14 +15092,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14408,17 +15108,17 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 ; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 ; GFX11-NEXT: v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9 ; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_clause 0x1 @@ -14440,6 +15140,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v27, 5 ; GFX11-NEXT: v_dual_mov_b32 v28, 5 :: v_dual_mov_b32 v29, 5 ; GFX11-NEXT: v_dual_mov_b32 v30, 6 :: v_dual_mov_b32 v31, 7 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 @@ -14448,13 +15149,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14462,11 +15164,13 @@ ; GFX10-SCRATCH: ; %bb.0: ; %entry ; 
GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 @@ -14475,8 +15179,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 @@ -14513,6 +15215,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 @@ -14520,14 +15223,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: @@ -14547,12 +15251,12 @@ ; GFX9-LABEL: stack_8xv5f32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -14603,6 +15307,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_8xv5f32@rel32@lo+4 @@ -14610,12 +15315,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, 
v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -14623,24 +15329,24 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v0, 
0x41300000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 -; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -14680,6 +15386,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_8xv5f32@rel32@lo+4 @@ -14687,14 +15394,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14702,11 +15410,13 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: 
scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x41400000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x41500000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41600000 @@ -14715,8 +15425,6 @@ ; GFX11-NEXT: v_mov_b32_e32 v5, 0x41100000 ; GFX11-NEXT: v_mov_b32_e32 v6, 0x41200000 ; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_clause 0x1 @@ -14740,6 +15448,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX11-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX11-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 @@ -14748,13 +15457,14 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14762,11 +15472,13 @@ ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, 
v40, s32 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41400000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41500000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41600000 @@ -14775,8 +15487,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41100000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 @@ -14813,6 +15523,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 @@ -14820,14 +15531,15 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; 
GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -9,16 +9,17 @@ ; GFX9-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -31,12 +32,13 @@ 
; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -44,15 +46,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -67,14 +70,15 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; 
GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -82,15 +86,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_getpc_b64 s[4:5] ; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -106,13 +111,14 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; 
GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_void() @@ -220,15 +226,16 @@ ; GFX9-LABEL: test_call_void_func_void_mayclobber_s31: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s31 @@ -245,12 +252,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; 
GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -258,15 +266,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 @@ -284,14 +293,15 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, 
s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -299,15 +309,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 @@ -325,13 +336,14 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 
+; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %s31 = call i32 asm sideeffect "; def $0", "={s31}"() @@ -344,14 +356,15 @@ ; GFX9-LABEL: test_call_void_func_void_mayclobber_v31: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v42, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v42, s34, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -369,12 +382,13 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v42, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -382,20 +396,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v42, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_mov_b32_e32 v41, v31 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 @@ -408,14 +423,15 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v42, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v42, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -423,20 +439,21 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v42, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v31 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_mov_b32_e32 v41, v31 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 @@ -450,13 +467,14 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v42, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v42, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 
offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %v31 = call i32 asm sideeffect "; def $0", "={v31}"() @@ -470,15 +488,16 @@ ; GFX9-LABEL: test_call_void_func_void_preserves_s33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s33 @@ -495,12 +514,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: 
s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -508,22 +528,23 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -534,14 +555,15 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: 
v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -549,22 +571,23 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s33 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_mov_b32 s4, s33 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -576,13 +599,14 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 
-16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %s33 = call i32 asm sideeffect "; def $0", "={s33}"() @@ -595,13 +619,14 @@ ; GFX9-LABEL: test_call_void_func_void_preserves_s34: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -620,12 +645,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 
offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -633,22 +659,23 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[36:37] ; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s34 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] @@ -659,14 +686,15 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -674,22 +702,23 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s34 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_mov_b32 s4, s34 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -701,13 
+730,14 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %s34 = call i32 asm sideeffect "; def $0", "={s34}"() @@ -720,14 +750,15 @@ ; GFX9-LABEL: test_call_void_func_void_preserves_v40: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v42, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: v_writelane_b32 v42, s34, 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v41, s31, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -743,12 +774,13 @@ ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; 
GFX9-NEXT: v_readlane_b32 s30, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v42, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -756,20 +788,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v41, s30, 0 -; GFX10-NEXT: v_writelane_b32 v42, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: 
s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 @@ -780,14 +813,15 @@ ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v42, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v42, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -795,20 +829,21 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v41, s30, 0 -; GFX11-NEXT: v_writelane_b32 v42, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: ;;#ASMSTART ; 
GFX11-NEXT: ; def v40 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 @@ -820,13 +855,14 @@ ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v42, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v42, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %v40 = call i32 asm sideeffect "; def $0", "={v40}"() @@ -963,14 +999,15 @@ ; GFX9-LABEL: test_call_void_func_void_clobber_s33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 
v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 @@ -978,12 +1015,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -991,15 +1029,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: 
s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s33@rel32@hi+12 @@ -1007,14 +1046,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1022,15 +1062,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s33@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s33@rel32@hi+12 @@ -1039,13 +1080,14 @@ ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @void_func_void_clobber_s33() @@ -1056,14 +1098,15 @@ ; GFX9-LABEL: test_call_void_func_void_clobber_s34: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 @@ -1071,12 +1114,13 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: 
v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1084,15 +1128,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s34@rel32@hi+12 @@ -1100,14 +1145,15 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 
-; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1115,15 +1161,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s34@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s34@rel32@hi+12 @@ -1132,13 +1179,14 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 
s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @void_func_void_clobber_s34() @@ -1149,15 +1197,16 @@ ; GFX9-LABEL: callee_saved_sgpr_kernel: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 @@ -1173,12 +1222,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; 
GFX9-NEXT: v_readlane_b32 s33, v41, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1186,22 +1236,23 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -1211,14 +1262,15 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 0 -; 
GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1226,22 +1278,23 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -1252,13 +1305,14 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 @@ -1271,15 +1325,16 @@ ; GFX9-LABEL: callee_saved_sgpr_vgpr_kernel: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v42, s33, 0 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v42, s34, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART @@ -1304,12 +1359,13 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: 
v_readlane_b32 s34, v42, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v42, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1317,29 +1373,30 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s34, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v42, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: 
;;#ASMEND ; GFX10-NEXT: v_mov_b32_e32 v41, v32 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s4 @@ -1351,14 +1408,15 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v42, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: v_readlane_b32 s34, v42, 0 +; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1366,30 +1424,30 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v42, s33, 0 -; GFX11-NEXT: 
s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v32 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: v_mov_b32_e32 v41, v32 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s4 @@ -1401,13 +1459,14 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: v_readlane_b32 s33, v42, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v42, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -24,11 +24,11 @@ ; 
GFX9-LABEL: call_i1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 @@ -40,11 +40,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -52,12 +52,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_mov_b32 s36, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 @@ -69,12 +69,12 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: 
v_readlane_b32 s30, v1, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -82,11 +82,11 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_i1@gotpcrel32@lo+4 @@ -99,11 +99,11 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -132,11 +132,11 @@ ; GFX9-LABEL: call_i16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] 
; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 @@ -148,11 +148,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -160,12 +160,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_mov_b32 s36, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 @@ -177,12 +177,12 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v1, 
off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -190,11 +190,11 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_i16@gotpcrel32@lo+4 @@ -207,11 +207,11 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -240,11 +240,11 @@ ; GFX9-LABEL: call_2xi16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: 
s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 @@ -256,11 +256,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -268,12 +268,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_mov_b32 s36, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 @@ -285,12 +285,12 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; @@ -298,11 +298,11 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_2xi16@gotpcrel32@lo+4 @@ -315,11 +315,11 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -357,11 +357,11 @@ ; GFX9-LABEL: call_3xi16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 @@ -373,11 +373,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: 
v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -385,12 +385,12 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_mov_b32 s36, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 @@ -402,12 +402,12 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -415,11 +415,11 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: 
scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_3xi16@gotpcrel32@lo+4 @@ -432,11 +432,11 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -1638,12 +1638,12 @@ ; GFX9-LABEL: call_512xi32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x1ffc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffe0000 +; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0x60000 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 @@ -1656,11 +1656,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000 -; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000 +; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1668,14 +1668,14 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0 +; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_mov_b32 s36, s33 -; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0 ; GFX10-NEXT: s_add_i32 s32, s32, 0x30000 -; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12 @@ -1687,12 +1687,12 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 -; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000 -; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000 +; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1700,13 +1700,14 @@ ; GFX11: ; %bb.0: 
; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v5, s32 offset:2048 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s34, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x7ff -; GFX11-NEXT: s_addk_i32 s32, 0x1800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v5, s33 offset:2048 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_addk_i32 s32, 0x1800 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_512xi32@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_512xi32@gotpcrel32@hi+12 @@ -1719,11 +1720,11 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0 -; GFX11-NEXT: s_addk_i32 s32, 0xe800 -; GFX11-NEXT: s_mov_b32 s33, s34 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:2048 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v5, off, s33 offset:2048 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_addk_i32 s32, 0xe800 +; GFX11-NEXT: s_mov_b32 s33, s34 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -393,12 +393,13 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] 
-; GCN-NEXT: v_writelane_b32 v41, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -465,24 +466,26 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v41, s33, 0 +; GISEL-NEXT: s_mov_b32 s16, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill +; GISEL-NEXT: s_mov_b64 exec, s[18:19] +; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -549,12 +552,13 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s4, v41, 0 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v41, 0 -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call void %fptr() @@ -565,12 +569,13 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v41, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -640,24 
+645,26 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v41, s33, 0 +; GISEL-NEXT: s_mov_b32 s16, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b64 exec, s[18:19] +; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -725,12 +732,13 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s4, v41, 0 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 
4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v41, 0 -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call void %fptr(i32 123) @@ -741,12 +749,13 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_ret: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v41, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -815,24 +824,26 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 0 -; GCN-NEXT: 
s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_ret: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v41, s33, 0 +; GISEL-NEXT: s_mov_b32 s16, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b64 exec, s[18:19] +; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -901,12 +912,13 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s4, v41, 0 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v41, 0 -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: 
s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %a = call i32 %fptr() @@ -918,12 +930,13 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_in_branch: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v41, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -1001,24 +1014,26 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_in_branch: ; GISEL: ; %bb.0: ; %bb0 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_or_saveexec_b64 
s[16:17], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[16:17] -; GISEL-NEXT: v_writelane_b32 v41, s33, 0 +; GISEL-NEXT: s_mov_b32 s16, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b64 exec, s[18:19] +; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -1096,12 +1111,13 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s4, v41, 0 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: v_readlane_b32 s33, v41, 0 -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] bb0: @@ -1119,11 +1135,11 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s5, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 
s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -1202,22 +1218,22 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s5 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_mov_b32 s5, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -1296,11 +1312,11 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s5 -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: 
s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 inreg 123) @@ -1311,11 +1327,11 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -1398,22 +1414,22 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 @@ -1496,11 +1512,11 @@ ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 ; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 %i) @@ -1515,11 +1531,11 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -1600,22 +1616,22 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; 
GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -1696,11 +1712,11 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call amdgpu_gfx i32 %fptr(i32 %i) @@ -1712,11 +1728,11 @@ ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -1794,22 +1810,22 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; 
GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -1887,11 +1903,11 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] tail call amdgpu_gfx void %fptr() diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -188,16 +188,17 @@ ; GFX9-LABEL: slsr1_1: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_writelane_b32 v44, s33, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v44, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s36, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 @@ -229,12 +230,13 @@ ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v44, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v44, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %b = and i32 %b.arg, 16777215 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll 
b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,11 +27,11 @@ ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_writelane_b32 v1, s30, 0 @@ -50,11 +50,11 @@ ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 +; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -152,11 +152,11 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v1, 
off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v1, s30, 0 ; CHECK-NEXT: v_writelane_b32 v1, s31, 1 @@ -170,11 +170,11 @@ ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 +; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -186,11 +186,11 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_mov_b32 s7, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v2, s30, 0 ; CHECK-NEXT: v_writelane_b32 v2, s31, 1 @@ -204,11 +204,11 @@ ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s7 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 +; CHECK-NEXT: s_mov_b32 
s33, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -11,13 +11,13 @@ ; GCN: s_waitcnt ; Spill CSR VGPR used for SGPR spilling -; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v41, s33, 0 -; GCN-DAG: s_mov_b32 s33, s32 -; GCN-DAG: s_addk_i32 s32, 0x400 +; GCN-DAG: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN-DAG: v_writelane_b32 v40, s30, 0 ; GCN-DAG: v_writelane_b32 v40, s31, 1 @@ -26,12 +26,13 @@ ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 0 +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] define 
void @test_func_call_external_void_func_i32_imm() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -13,12 +13,13 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[16:17] -; CHECK-NEXT: v_writelane_b32 v41, s33, 0 +; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: v_writelane_b32 v41, s16, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 @@ -38,13 +39,14 @@ ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 +; CHECK-NEXT: v_readlane_b32 s4, v41, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: v_readlane_b32 s33, v41, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v40, off, 
s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .Ltmp2: diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -29,13 +29,14 @@ ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs ; CHECK: liveins: $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 + ; CHECK-NEXT: $sgpr4 = COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def 
$sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -49,12 +50,13 @@ ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0 + ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc - ; CHECK-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, 
implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc - ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -25,13 +25,14 @@ ; MUBUF-LABEL: name: scavenge_sgpr_pei_no_sgprs ; MUBUF: liveins: $vgpr1, $vgpr2 ; MUBUF-NEXT: {{ $}} - ; MUBUF-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc - ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 + ; MUBUF-NEXT: $sgpr4 = COPY $sgpr33 ; MUBUF-NEXT: $sgpr33 = 
frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -39,23 +40,25 @@ ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, 
implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0 + ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc - ; MUBUF-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 - ; MUBUF-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc - ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 ; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc ; FLATSCR-LABEL: name: scavenge_sgpr_pei_no_sgprs ; FLATSCR: liveins: $vgpr1, $vgpr2 ; FLATSCR-NEXT: {{ $}} - ; FLATSCR-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def dead $scc - ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 + ; FLATSCR-NEXT: $sgpr4 = COPY $sgpr33 ; FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 
4294959104, implicit-def dead $scc + ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc + ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) + ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc @@ -64,12 +67,13 @@ ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; FLATSCR-NEXT: $sgpr33 = 
S_ADD_I32 $sgpr33, -16384, implicit-def $scc + ; FLATSCR-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0 + ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc + ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) + ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc - ; FLATSCR-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 - ; FLATSCR-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def dead $scc - ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; FLATSCR-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ 
b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -24,13 +24,14 @@ ; CHECK-LABEL: name: scavenge_sgpr_pei ; CHECK: liveins: $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 262400, implicit-def dead $scc - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 + ; CHECK-NEXT: $sgpr4 = COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 262080, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294705152, implicit-def dead $scc + ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 786432, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def 
$sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -39,12 +40,13 @@ ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -4096, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0 + ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc + ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -786432, implicit-def dead $scc - ; CHECK-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 262400, implicit-def dead $scc - ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def 
$sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.0, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -26,13 +26,14 @@ ; GFX8-LABEL: name: pei_scavenge_vgpr_spill ; GFX8: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc - ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; GFX8-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 + ; 
GFX8-NEXT: $sgpr4 = COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GFX8-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX8-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 8192 @@ -41,57 +42,62 @@ ; GFX8-NEXT: $vcc_lo = S_MOV_B32 16384 ; GFX8-NEXT: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec ; GFX8-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX8-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0 + ; GFX8-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GFX8-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; GFX8-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc - ; GFX8-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 - ; GFX8-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc - ; GFX8-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from 
%stack.3, addrspace 5) - ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX8-NEXT: $sgpr33 = COPY $sgpr4 ; GFX8-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; GFX9-LABEL: name: pei_scavenge_vgpr_spill ; GFX9: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; GFX9-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 + ; GFX9-NEXT: $sgpr4 = COPY $sgpr33 ; GFX9-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX9-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX9-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2 ; GFX9-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX9-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec ; GFX9-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; GFX9-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX9-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0 + ; GFX9-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead 
$scc, implicit $exec + ; GFX9-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX9-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc - ; GFX9-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc - ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr33 = COPY $sgpr4 ; GFX9-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill ; GFX9-FLATSCR: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX9-FLATSCR-NEXT: {{ $}} - ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def dead $scc - ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) - ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; GFX9-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 + ; GFX9-FLATSCR-NEXT: $sgpr4 = COPY $sgpr33 ; GFX9-FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX9-FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX9-FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GFX9-FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc + ; 
GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) + ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2 ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $vcc_lo = S_ADD_I32 $sgpr33, 8192, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vcc_lo, implicit $exec ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr33, 16384, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $vcc_hi, $vgpr1, implicit $exec + ; GFX9-FLATSCR-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0 + ; GFX9-FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GFX9-FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc + ; GFX9-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) + ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc - ; GFX9-FLATSCR-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 - ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def dead $scc - ; GFX9-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) - ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll 
b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,11 +16,11 @@ ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill @@ -263,11 +263,11 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 +; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -308,11 +308,11 @@ ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: 
s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill @@ -553,11 +553,11 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 +; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -198,12 +198,14 @@ ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: -; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword [[CSRV_1:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 +; GCN-NEXT: buffer_store_dword 
[[CSRV:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[CSRV_1:v[0-9]+]], off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 [[CSRV_1]], s33, 0 ; GCN-DAG: s_addk_i32 s32, 0x800 +; GCN: v_writelane_b32 [[CSRV_1]], [[FP_SCRATCH_COPY]], 0 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -223,16 +225,15 @@ ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 - -; GCN-DAG: v_readlane_b32 s30, [[CSRV]], 0 -; GCN-DAG: v_readlane_b32 s31, [[CSRV]], 1 - -; GCN: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword [[CSRV_1]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1 +; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0 +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV_1]], 0 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[CSRV_1]], off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_addk_i32 s32, 0xf800 +; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -1,21 +1,23 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}spill_csr_s5_copy: +; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v41, s33, 0 +; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 s33, v41, 0 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec +; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 define void @spill_csr_s5_copy() #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -156,31 +156,34 @@ ; The BP value is saved/restored with a VGPR spill. 
; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: -; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword [[VGPR_REG_1:v[0-9]+]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 +; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[VGPR_REG_1:v[0-9]+]], off, s[0:3], s33 offset:1032 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-DAG: v_writelane_b32 [[VGPR_REG_1]], s34, 1 -; GCN-NEXT: v_writelane_b32 [[VGPR_REG_1]], s33, 0 -; GCN-DAG: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 -; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 -; GCN: v_mov_b32_e32 v32, 0 ; GCN: s_mov_b32 s34, s32 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-DAG: s_add_i32 s32, s32, 0x30000 +; GCN: v_writelane_b32 [[VGPR_REG_1]], [[FP_SCRATCH_COPY]], 0 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 ; GCN: s_swappc_b64 s[30:31], ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 -; GCN: s_add_i32 s32, s32, 0xfffd0000 ; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG_1]], 1 -; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG_1]], 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword [[VGPR_REG_1]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG_1]], 0 +; GCN-NEXT: 
s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[VGPR_REG_1]], off, s[0:3], s33 offset:1032 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 +; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 s[30:31] %temp = alloca i32, align 1024, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 1024 @@ -197,9 +200,9 @@ ; The BP value will get saved/restored in an SGPR at the prolgoue/epilogue. ; GCN-LABEL: needs_align1024_stack_args_used_inside_loop: -; GCN: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 -; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 +; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 ; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34 @@ -208,8 +211,8 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024 ; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen ; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]] -; GCN: s_add_i32 s32, s32, 0xfffd0000 -; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]] +; GCN: s_mov_b32 s34, [[BP_COPY]] +; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_setpc_b64 s[30:31] begin: @@ -261,12 +264,13 @@ ; If there are no free SGPRs or VGPRs available we must spill the BP to memory. 
; GCN-LABEL: no_free_regs_spill_bp_to_mem -; GCN: s_xor_saveexec_b64 s[4:5], -1 -; GCN: buffer_store_dword v39, off, s[0:3], s32 +; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; GCN: s_xor_saveexec_b64 s[6:7], -1 +; GCN: buffer_store_dword v39, off, s[0:3], s33 ; GCN: v_mov_b32_e32 v0, s34 -; GCN: buffer_store_dword v0, off, s[0:3], s32 -; GCN: v_mov_b32_e32 v0, s33 -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 +; GCN: buffer_store_dword v0, off, s[0:3], s33 +; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] +; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33 %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 @@ -295,19 +299,22 @@ ; scratch VGPR to hold the offset. ; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset -; GCN: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_add_i32 s6, s32, 0x42100 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s6 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_add_i32 s5, s33, 0x42100 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOT: v_mov_b32_e32 v0, 0x1088 -; GCN-NEXT: s_add_i32 s6, s32, 0x42300 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v0, s33 ; GCN-NOT: v_mov_b32_e32 v0, 0x108c -; GCN-NEXT: s_add_i32 s6, s32, 0x42200 +; GCN-NEXT: s_add_i32 s5, s33, 0x42300 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] +; GCN-NOT: v_mov_b32_e32 v0, 0x1088 +; GCN-NEXT: s_add_i32 s5, s33, 0x42200 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword 
v0, off, s[0:3], s5 ; 4-byte Folded Spill %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll --- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll @@ -17,12 +17,12 @@ ; GCN-LABEL: caller: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s36, s33 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: s_mov_b32 s36, s33 -; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v1, s30, 1 ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 @@ -35,11 +35,11 @@ ; GCN-NEXT: v_readlane_b32 s31, v1, 2 ; GCN-NEXT: v_readlane_b32 s30, v1, 1 ; GCN-NEXT: v_readlane_b32 s4, v1, 0 -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s36 ; GCN-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s36 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %add = fadd float %arg0, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -8,13 +8,14 @@ ; GFX90A-LABEL: widget: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_or_saveexec_b64 s[16:17], -1 -; 
GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: s_mov_b64 exec, s[16:17] -; GFX90A-NEXT: v_writelane_b32 v41, s33, 0 +; GFX90A-NEXT: s_mov_b32 s16, s33 ; GFX90A-NEXT: s_mov_b32 s33, s32 +; GFX90A-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[18:19] ; GFX90A-NEXT: s_addk_i32 s32, 0x400 +; GFX90A-NEXT: v_writelane_b32 v41, s16, 0 ; GFX90A-NEXT: s_getpc_b64 s[16:17] ; GFX90A-NEXT: s_add_u32 s16, s16, wobble@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s17, s17, wobble@gotpcrel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -6,12 +6,13 @@ ; GCN-LABEL: widget: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v42, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v42, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -119,12 
+120,13 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v42, 0 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v42, 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .LBB0_9: ; %bb2 @@ -263,12 +265,13 @@ ; GCN-LABEL: blam: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v46, s33, 0 +; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v46, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x800 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir --- 
a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir +++ b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir @@ -39,13 +39,14 @@ ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; MUBUF-NEXT: liveins: $vgpr1, $vgpr2 ; MUBUF-NEXT: {{ $}} - ; MUBUF-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 9961728, implicit-def dead $scc - ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 + ; MUBUF-NEXT: $sgpr4 = COPY $sgpr33 ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc + ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) + ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 11010048, implicit-def dead $scc ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def 
$sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -66,25 +67,27 @@ ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: liveins: $vgpr2 ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0 + ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc + ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) + ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -11010048, implicit-def dead $scc - ; MUBUF-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 - ; MUBUF-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 9961728, implicit-def dead $scc - ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 ; MUBUF-NEXT: S_ENDPGM 0 ; FLATSCR-LABEL: name: use_restore_frame_reg ; FLATSCR: bb.0: ; FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; FLATSCR-NEXT: liveins: $vgpr1, $vgpr2 ; FLATSCR-NEXT: {{ $}} - ; FLATSCR-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 155652, implicit-def dead $scc - ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.20, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, 
undef $vgpr2 + ; FLATSCR-NEXT: $sgpr4 = COPY $sgpr33 ; FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc + ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.20, addrspace 5) + ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 172032, implicit-def dead $scc ; FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -114,12 +117,13 @@ ; FLATSCR-NEXT: bb.2: ; FLATSCR-NEXT: liveins: $vgpr2 ; FLATSCR-NEXT: {{ $}} + ; FLATSCR-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0 + ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc + ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.20, addrspace 5) + ; 
FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -172032, implicit-def dead $scc - ; FLATSCR-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 - ; FLATSCR-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 155652, implicit-def dead $scc - ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.20, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; FLATSCR-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -11,12 +11,12 @@ ; GFX9-LABEL: non_preserved_vgpr_tuple8: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v45, s33, 0 +; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v36, v16 ; GFX9-NEXT: v_mov_b32_e32 v35, v15 ; GFX9-NEXT: v_mov_b32_e32 v34, v14 @@ -36,6 +36,7 @@ ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: v_writelane_b32 v45, s4, 0 ; GFX9-NEXT: s_getpc_b64 
s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -54,12 +55,13 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v45, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v45, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -67,18 +69,18 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: s_mov_b32 s4, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s5, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v36, v16 ; GFX10-NEXT: v_mov_b32_e32 v35, v15 ; GFX10-NEXT: v_mov_b32_e32 v34, v14 ; GFX10-NEXT: v_mov_b32_e32 v33, v13 ; GFX10-NEXT: v_mov_b32_e32 v32, v12 -; GFX10-NEXT: 
v_writelane_b32 v45, s33, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -93,7 +95,7 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: v_writelane_b32 v45, s4, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -113,14 +115,15 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v45, 0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: v_readlane_b32 s4, v45, 0 +; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_mov_b32 s33, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -128,16 +131,16 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 
off, v40, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:20 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 ; GFX11-NEXT: v_mov_b32_e32 v32, v12 -; GFX11-NEXT: v_writelane_b32 v45, s33, 0 -; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 @@ -153,6 +156,7 @@ ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v45, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 @@ -170,13 +174,14 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v45, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v45, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:20 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -207,12 +212,12 @@ ; GFX9-LABEL: call_preserved_vgpr_tuple8: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v46, s33, 0 +; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill @@ -225,6 +230,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v41, v12 ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: v_writelane_b32 v46, s4, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -244,12 +250,13 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v46, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v46, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte 
Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -257,13 +264,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v46, s33, 0 +; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_or_saveexec_b32 s5, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill @@ -271,7 +278,7 @@ ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: v_writelane_b32 v46, s4, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -296,14 +303,15 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v46, 0 -; GFX10-NEXT: s_or_saveexec_b32 
s4, -1 +; GFX10-NEXT: v_readlane_b32 s4, v46, 0 +; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_mov_b32 s33, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -311,13 +319,13 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:24 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v46, s33, 0 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:24 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:12 @@ -326,6 +334,7 @@ ; GFX11-NEXT: scratch_store_b32 off, v45, s33 ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v46, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 @@ -348,13 +357,14 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: 
v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v46, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_readlane_b32 s0, v46, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:24 -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:24 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1108,39 +1108,39 @@ ; GCN-LABEL: {{^}}callee_no_stack_with_call: ; GCN: s_waitcnt ; GCN-NEXT: s_waitcnt_vscnt - +; GCN-NEXT: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_mov_b32 s33, s32 ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]+]], -1{{$}} -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v41, s33, 0 -; GCN: s_mov_b32 s33, s32 ; GFX1064: s_addk_i32 s32, 0x400 ; GFX1032: s_addk_i32 s32, 0x200 - +; GCN-NEXT: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN-DAG: v_writelane_b32 v40, s30, 0 + ; GCN-DAG: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 ; GCN-DAG: v_readlane_b32 s30, v40, 0 ; 
GCN-DAG: v_readlane_b32 s31, v40, 1 - -; GFX1064: s_addk_i32 s32, 0xfc00 -; GFX1032: s_addk_i32 s32, 0xfe00 -; GCN: v_readlane_b32 s33, v41, 0 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] +; GFX1064: s_addk_i32 s32, 0xfc00 +; GFX1032: s_addk_i32 s32, 0xfe00 +; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_with_call() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -332,13 +332,13 @@ ; GFX9-O0-LABEL: strict_wwm_call: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s35, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte 
Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 @@ -370,27 +370,27 @@ ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GFX9-O0-NEXT: s_mov_b32 s33, s35 -; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_call: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O3-NEXT: s_mov_b32 s38, s33 +; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; 
GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-O3-NEXT: s_mov_b32 s38, s33 -; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 @@ -410,13 +410,13 @@ ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0) @@ -516,24 +516,24 @@ ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: s_mov_b32 s42, s33 +; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8 ; 4-byte 
Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_mov_b32 s42, s33 -; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 @@ -598,50 +598,50 @@ ; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: 
v_readlane_b32 s30, v10, 0 -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 -; GFX9-O0-NEXT: s_mov_b32 s33, s42 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, 
s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 +; GFX9-O0-NEXT: s_mov_b32 s33, s42 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_call_i64: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O3-NEXT: s_mov_b32 s40, s33 +; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, 
s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 -; GFX9-O3-NEXT: s_mov_b32 s40, s33 -; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -671,22 +671,22 @@ ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 -; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-O3-NEXT: s_mov_b32 s33, s40 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 
exec, s[34:35] +; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-O3-NEXT: s_mov_b32 s33, s40 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)