diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -69,6 +69,10 @@ Register PreloadedPrivateBufferReg, Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const; + void emitPrologueEntryCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const; + public: bool hasFP(const MachineFunction &MF) const override; @@ -76,6 +80,12 @@ /// Create a CFI index for CFIInst and build a MachineInstr around it. void buildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst) const; + /// Create a CFI index describing a spill of an SGPR to a single lane of + /// a VGPR and build a MachineInstr around it. + void buildCFIForSGPRToVGPRSpill(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const Register SGPR, + const Register VGPR, const int Lane) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -11,12 +11,14 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCDwarf.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Support/LEB128.h" using namespace llvm; @@ -880,6 +882,64 @@ return ScratchExecCopy; } +void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const { + const MachineFunction &MF = *MBB.getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const 
MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + Register StackPtrReg = + MF.getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg(); + + // DW_ASPACE_AMDGPU_private_wave FIXME: should be defined elsewhere + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createLLVMDefAspaceCfa( + nullptr, MCRI->getDwarfRegNum(StackPtrReg, false), 0, 6)); + + static const char PCEncodedInst[] = { + dwarf::DW_CFA_expression, + 16, // PC 64 + 8, // length + static_cast<char>(dwarf::DW_OP_regx), + 62, // SGPR30 + static_cast<char>(dwarf::DW_OP_piece), + 4, // 32 bits + static_cast<char>(dwarf::DW_OP_regx), + 63, // SGPR31 + static_cast<char>(dwarf::DW_OP_piece), + 4 // 32 bits + }; + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createEscape( + nullptr, StringRef(PCEncodedInst, sizeof(PCEncodedInst)))); + + BitVector IsCalleeSaved(TRI.getNumRegs()); + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + for (unsigned I = 0; CSRegs[I]; ++I) { + IsCalleeSaved.set(CSRegs[I]); + } + auto ProcessReg = [&](MCPhysReg Reg) { + if (IsCalleeSaved.test(Reg) || !MRI.isPhysRegModified(Reg)) + return; + int DwarfReg = MCRI->getDwarfRegNum(Reg, false); + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createUndefined(nullptr, DwarfReg)); + }; + + // Emit CFI rules for caller saved Arch VGPRs which are clobbered + for_each(AMDGPU::VGPR_32RegClass.getRegisters(), ProcessReg); + + // Emit CFI rules for caller saved Accum VGPRs which are clobbered + if (ST.hasMAIInsts()) { + for_each(AMDGPU::AGPR_32RegClass.getRegisters(), ProcessReg); + } + + // Emit CFI rules for caller saved SGPRs which are clobbered + for_each(AMDGPU::SGPR_32RegClass.getRegisters(), ProcessReg); +} + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); @@ -893,6 +953,7 @@ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const 
SIRegisterInfo &TRI = TII->getRegisterInfo(); + const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo(); Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); Register FramePtrReg = FuncInfo->getFrameOffsetReg(); @@ -911,6 +972,8 @@ // turn on all lanes before doing the spill to memory. Register ScratchExecCopy; + emitPrologueEntryCFI(MBB, MBBI, DL); + bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); bool SpillFPToMemory = false; // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. @@ -937,14 +1000,21 @@ if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + int FI = Reg.FI.getValue(); + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, - FuncInfo->getScratchRSrcReg(), - StackPtrReg, - Reg.FI.getValue()); + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + + // We spill the entire VGPR, so we can get away with just cfi_offset + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(Reg.VGPR, false), + MFI.getObjectOffset(FI) * ST.getWavefrontSize())); } if (HasFPSaveIndex && SpillFPToMemory) { - assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue())); + const int FI = *FuncInfo->FramePointerSaveIndex; + assert(!MFI.isDeadObjectIndex(FI)); if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); @@ -958,12 +1028,16 @@ .addReg(FramePtrReg); buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, - FuncInfo->FramePointerSaveIndex.getValue()); + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(FramePtrReg, false), + MFI.getObjectOffset(FI) * ST.getWavefrontSize())); } if (HasBPSaveIndex && SpillBPToMemory) { - assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex)); + const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + 
assert(!MFI.isDeadObjectIndex(BasePtrFI)); if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); @@ -977,8 +1051,11 @@ .addReg(BasePtrReg); buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, - *FuncInfo->BasePointerSaveIndex); + FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(BasePtrReg, false), + MFI.getObjectOffset(BasePtrFI) * ST.getWavefrontSize())); } if (ScratchExecCopy) { @@ -992,7 +1069,7 @@ // In this case, spill the FP to a reserved VGPR. if (HasFPSaveIndex && !SpillFPToMemory) { - const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + const int FI = *FuncInfo->FramePointerSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); @@ -1006,6 +1083,9 @@ .addReg(FramePtrReg) .addImm(Spill[0].Lane) .addReg(Spill[0].VGPR, RegState::Undef); + + buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, FramePtrReg, Spill[0].VGPR, + Spill[0].Lane); } // In this case, spill the BP to a reserved VGPR. @@ -1024,6 +1104,8 @@ .addReg(BasePtrReg) .addImm(Spill[0].Lane) .addReg(Spill[0].VGPR, RegState::Undef); + buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, BasePtrReg, Spill[0].VGPR, + Spill[0].Lane); } // Emit the copy if we need an FP, and are using a free SGPR to save it. @@ -1032,6 +1114,11 @@ FuncInfo->SGPRForFPSaveRestoreCopy) .addReg(FramePtrReg) .setMIFlag(MachineInstr::FrameSetup); + buildCFI( + MBB, MBBI, DL, + MCCFIInstruction::createRegister( + nullptr, MCRI->getDwarfRegNum(FramePtrReg, false), + MCRI->getDwarfRegNum(FuncInfo->SGPRForFPSaveRestoreCopy, false))); } // Emit the copy if we need a BP, and are using a free SGPR to save it. 
@@ -1040,6 +1127,11 @@ FuncInfo->SGPRForBPSaveRestoreCopy) .addReg(BasePtrReg) .setMIFlag(MachineInstr::FrameSetup); + buildCFI( + MBB, MBBI, DL, + MCCFIInstruction::createRegister( + nullptr, MCRI->getDwarfRegNum(BasePtrReg, false), + MCRI->getDwarfRegNum(FuncInfo->SGPRForBPSaveRestoreCopy, false))); } // If a copy has been emitted for FP and/or BP, Make the SGPRs @@ -1101,6 +1193,12 @@ .setMIFlag(MachineInstr::FrameSetup); } + if (HasFP) { + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaRegister( + nullptr, MCRI->getDwarfRegNum(FramePtrReg, false))); + } + if (HasFP && RoundedSize != 0) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) @@ -1134,6 +1232,7 @@ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); LivePhysRegs LiveRegs; @@ -1184,7 +1283,7 @@ Register ScratchExecCopy; if (HasFPSaveIndex) { - const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + const int FI = *FuncInfo->FramePointerSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); if (SpillFPToMemory) { if (!ScratchExecCopy) @@ -1210,6 +1309,11 @@ } } + if (hasFP(MF)) + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaRegister( + nullptr, MCRI->getDwarfRegNum(StackPtrReg, false))); + if (HasBPSaveIndex) { const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); @@ -1541,3 +1645,56 @@ .addCFIIndex(MF.addFrameInst(CFIInst)) .setMIFlag(MachineInstr::FrameSetup); } + +static void encodeDwarfRegisterLocation(int DwarfReg, raw_ostream &OS) { + if (DwarfReg < 32) { + OS << uint8_t(dwarf::DW_OP_reg0 + DwarfReg); + } else { + OS << uint8_t(dwarf::DW_OP_regx); + encodeULEB128(DwarfReg, OS); + } +} + +void SIFrameLowering::buildCFIForSGPRToVGPRSpill( + 
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const Register SGPR, const Register VGPR, + const int Lane) const { + MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo(); + int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false); + int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false); + + // CFI for an SGPR spilled to a single lane of a VGPR is implemented as an + // expression(E) rule where E is a register location description referencing + // a VGPR register location storage at a byte offset of the lane index + // multiplied by the size of an SGPR (4 bytes). In other words we generate + // the following DWARF: + // + // DW_CFA_expression: <SGPR>, + // (DW_OP_regx <VGPR>) (DW_OP_LLVM_offset_uconst <Lane>*4) + // + // The memory location description for the current CFA is pushed on the + // stack before E is evaluated, but we choose not to drop it as it would + // require a longer expression E and DWARF defines the result of the + // evaluation to be the location description on the top of the stack (i.e. the + // implicitly pushed one is just ignored.) + SmallString<20> CFIInst; + raw_svector_ostream OSCFIInst(CFIInst); + SmallString<20> Block; + raw_svector_ostream OSBlock(Block); + + OSCFIInst << uint8_t(dwarf::DW_CFA_expression); + encodeULEB128(DwarfSGPR, OSCFIInst); + + encodeDwarfRegisterLocation(DwarfVGPR, OSBlock); + OSBlock << uint8_t(dwarf::DW_OP_LLVM_offset_uconst); + // FIXME: Can this be a function of the SGPR? 
+ const unsigned SGPRByteSize = 4; + encodeULEB128(Lane * SGPRByteSize, OSBlock); + + encodeULEB128(Block.size(), OSCFIInst); + OSCFIInst << Block; + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createEscape(nullptr, OSCFIInst.str())); +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -289,14 +289,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_add_u32 s33, s32, 0x3e0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00 ; GFX10-NEXT: s_add_u32 s32, s32, 0x800 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -166,8 +166,8 @@ ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s8, s33 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz BB2_3 diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -229,7 
+229,7 @@ ; GCN-NEXT: s_mov_b64 exec, ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 -define void @spill_only_csr_sgpr() { +define void @spill_only_csr_sgpr() #0 { call void asm sideeffect "; clobber s42", "~{s42}"() ret void } @@ -270,9 +270,9 @@ ; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr: ; GCN: s_waitcnt ; GCN-NEXT: v_writelane_b32 v1, s33, 63 -; GCN-COUNT-60: v_writelane_b32 v1 ; GCN: s_mov_b32 s33, s32 -; GCN-COUNT-2: v_writelane_b32 v1 +; GCN-COUNT-60: v_writelane_b32 v1 +; GCN-COUNT-1: v_writelane_b32 v1 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 @@ -306,9 +306,9 @@ ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. ; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr: ; GCN: s_waitcnt -; GCN-COUNT-62: v_writelane_b32 v1, ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-COUNT-62: v_writelane_b32 v1, ; GCN: v_writelane_b32 v1, ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill @@ -346,10 +346,10 @@ ; GCN: s_waitcnt ; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 ; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 -; MUBUF-NEXT: s_add_u32 s33, s32, 0x7ffc0 -; FLATSCR-NEXT: s_add_u32 s33, s32, 0x1fff -; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 -; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 +; MUBUF-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 +; FLATSCR-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x1fff +; MUBUF-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 +; FLATSCR-NEXT: s_and_b32 s33, [[SCRATCH]], 0xffffe000 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x100000 ; FLATSCR-NEXT: s_add_u32 s32, s32, 0x4000 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 @@ -369,19 +369,19 @@ ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp: ; GCN: s_waitcnt ; GCN-NEXT: v_writelane_b32 v1, s33, 2 -; 
GCN-NEXT: v_writelane_b32 v1, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v1, s30, 0 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 v1, s31, 1 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN: ;;#ASMSTART -; MUBUF: v_readlane_b32 s4, v1, 0 -; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 +; MUBUF: s_add_u32 s32, s32, 0x200 +; MUBUF-NEXT: v_readlane_b32 s4, v1, 0 ; MUBUF-NEXT: v_readlane_b32 s5, v1, 1 -; FLATSCR: v_readlane_b32 s0, v1, 0 -; FLATSCR-NEXT: s_add_u32 s32, s32, 8 +; FLATSCR: s_add_u32 s32, s32, 8 +; FLATSCR-NEXT: v_readlane_b32 s0, v1, 0 ; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 @@ -410,8 +410,8 @@ ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; MUBUF-DAG: buffer_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -31,8 +31,8 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 @@ -65,8 +65,8 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded 
Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 @@ -99,8 +99,8 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 @@ -133,8 +133,8 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll --- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll +++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -o - %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=asm -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s ; CHECK-LABEL: kern1: ; CHECK: .cfi_startproc @@ -23,7 +24,510 @@ ret void } -attributes #0 = { nounwind } +; CHECK-LABEL: func_no_clobber: +; CHECK: .cfi_startproc + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: %bb.0: +; SGPR32 = 64 +; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; DW_CFA_expression [0x10] 
+; PC_64 ULEB128(16)=[0x10] +; BLOCK_LENGTH ULEB128(8)=[0x08] +; DW_OP_regx [0x90] +; SGPR30 ULEB128(62)=[0x3e] +; DW_OP_piece [0x93] +; PIECE_SIZE [0x04] +; DW_OP_regx [0x90] +; SGPR31 ULEB128(63)=[0x3f] +; DW_OP_piece [0x93] +; PIECE_SIZE [0x04] +; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: .cfi_endproc +define hidden void @func_no_clobber() #0 { +entry: + ret void +} + +; CHECK-LABEL: {{^}}callee_need_to_spill_fp_to_memory: +; CHECK: .cfi_startproc + +; SGPR33 = 65 +; CHECK: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 +; CHECK: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; WAVE64: .cfi_offset 65, 28928 +; WAVE32: .cfi_offset 65, 14464 + +; CHECK: .cfi_endproc +define void @callee_need_to_spill_fp_to_memory() #1 { + call void asm sideeffect "; clobber nonpreserved SGPRs", + "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} + ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} + ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} + ,~{vcc}"() + + call void asm sideeffect "; clobber all VGPRs", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39} + ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49} + ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59} + ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69} + ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79} + ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89} + ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99} + ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109} + 
,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119} + ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129} + ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139} + ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149} + ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159} + ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169} + ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179} + ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189} + ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199} + ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209} + ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219} + ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229} + ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239} + ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249} + ,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}"() + ret void +} + +declare hidden void @ex() #0 + +; CHECK-LABEL: func_call_clobber: +; CHECK: .cfi_startproc + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: %bb.0: +; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 + +; VGPR0_wave64 = 2560 +; WAVE64-NEXT: .cfi_undefined 2560 +; WAVE64-NEXT: .cfi_undefined 2561 +; WAVE64-NEXT: .cfi_undefined 2562 +; WAVE64-NEXT: .cfi_undefined 2563 +; WAVE64-NEXT: .cfi_undefined 2564 +; WAVE64-NEXT: .cfi_undefined 2565 +; WAVE64-NEXT: .cfi_undefined 2566 +; WAVE64-NEXT: .cfi_undefined 2567 +; WAVE64-NEXT: .cfi_undefined 2568 +; WAVE64-NEXT: .cfi_undefined 2569 +; WAVE64-NEXT: .cfi_undefined 2570 +; WAVE64-NEXT: .cfi_undefined 2571 +; WAVE64-NEXT: .cfi_undefined 
2572 +; WAVE64-NEXT: .cfi_undefined 2573 +; WAVE64-NEXT: .cfi_undefined 2574 +; WAVE64-NEXT: .cfi_undefined 2575 +; WAVE64-NEXT: .cfi_undefined 2576 +; WAVE64-NEXT: .cfi_undefined 2577 +; WAVE64-NEXT: .cfi_undefined 2578 +; WAVE64-NEXT: .cfi_undefined 2579 +; WAVE64-NEXT: .cfi_undefined 2580 +; WAVE64-NEXT: .cfi_undefined 2581 +; WAVE64-NEXT: .cfi_undefined 2582 +; WAVE64-NEXT: .cfi_undefined 2583 +; WAVE64-NEXT: .cfi_undefined 2584 +; WAVE64-NEXT: .cfi_undefined 2585 +; WAVE64-NEXT: .cfi_undefined 2586 +; WAVE64-NEXT: .cfi_undefined 2587 +; WAVE64-NEXT: .cfi_undefined 2588 +; WAVE64-NEXT: .cfi_undefined 2589 +; WAVE64-NEXT: .cfi_undefined 2590 +; WAVE64-NEXT: .cfi_undefined 2591 +; WAVE64-NEXT: .cfi_undefined 2592 +; WAVE64-NEXT: .cfi_undefined 2593 +; WAVE64-NEXT: .cfi_undefined 2594 +; WAVE64-NEXT: .cfi_undefined 2595 +; WAVE64-NEXT: .cfi_undefined 2596 +; WAVE64-NEXT: .cfi_undefined 2597 +; WAVE64-NEXT: .cfi_undefined 2598 +; WAVE64-NEXT: .cfi_undefined 2599 + +; VGPR48_wave64 = 2608 +; WAVE64-NEXT: .cfi_undefined 2608 +; WAVE64-NEXT: .cfi_undefined 2609 +; WAVE64-NEXT: .cfi_undefined 2610 +; WAVE64-NEXT: .cfi_undefined 2611 +; WAVE64-NEXT: .cfi_undefined 2612 +; WAVE64-NEXT: .cfi_undefined 2613 +; WAVE64-NEXT: .cfi_undefined 2614 +; WAVE64-NEXT: .cfi_undefined 2615 + +; WAVE64-NEXT: .cfi_undefined 2624 +; WAVE64-NEXT: .cfi_undefined 2625 +; WAVE64-NEXT: .cfi_undefined 2626 +; WAVE64-NEXT: .cfi_undefined 2627 +; WAVE64-NEXT: .cfi_undefined 2628 +; WAVE64-NEXT: .cfi_undefined 2629 +; WAVE64-NEXT: .cfi_undefined 2630 +; WAVE64-NEXT: .cfi_undefined 2631 + +; WAVE64-NEXT: .cfi_undefined 2640 +; WAVE64-NEXT: .cfi_undefined 2641 +; WAVE64-NEXT: .cfi_undefined 2642 +; WAVE64-NEXT: .cfi_undefined 2643 +; WAVE64-NEXT: .cfi_undefined 2644 +; WAVE64-NEXT: .cfi_undefined 2645 +; WAVE64-NEXT: .cfi_undefined 2646 +; WAVE64-NEXT: .cfi_undefined 2647 + +; WAVE64-NEXT: .cfi_undefined 2656 +; WAVE64-NEXT: .cfi_undefined 2657 +; WAVE64-NEXT: .cfi_undefined 2658 +; WAVE64-NEXT: 
.cfi_undefined 2659 +; WAVE64-NEXT: .cfi_undefined 2660 +; WAVE64-NEXT: .cfi_undefined 2661 +; WAVE64-NEXT: .cfi_undefined 2662 +; WAVE64-NEXT: .cfi_undefined 2663 + +; WAVE64-NEXT: .cfi_undefined 2672 +; WAVE64-NEXT: .cfi_undefined 2673 +; WAVE64-NEXT: .cfi_undefined 2674 +; WAVE64-NEXT: .cfi_undefined 2675 +; WAVE64-NEXT: .cfi_undefined 2676 +; WAVE64-NEXT: .cfi_undefined 2677 +; WAVE64-NEXT: .cfi_undefined 2678 +; WAVE64-NEXT: .cfi_undefined 2679 + +; WAVE64-NEXT: .cfi_undefined 2688 +; WAVE64-NEXT: .cfi_undefined 2689 +; WAVE64-NEXT: .cfi_undefined 2690 +; WAVE64-NEXT: .cfi_undefined 2691 +; WAVE64-NEXT: .cfi_undefined 2692 +; WAVE64-NEXT: .cfi_undefined 2693 +; WAVE64-NEXT: .cfi_undefined 2694 +; WAVE64-NEXT: .cfi_undefined 2695 + +; WAVE64-NEXT: .cfi_undefined 2704 +; WAVE64-NEXT: .cfi_undefined 2705 +; WAVE64-NEXT: .cfi_undefined 2706 +; WAVE64-NEXT: .cfi_undefined 2707 +; WAVE64-NEXT: .cfi_undefined 2708 +; WAVE64-NEXT: .cfi_undefined 2709 +; WAVE64-NEXT: .cfi_undefined 2710 +; WAVE64-NEXT: .cfi_undefined 2711 + +; WAVE64-NEXT: .cfi_undefined 2720 +; WAVE64-NEXT: .cfi_undefined 2721 +; WAVE64-NEXT: .cfi_undefined 2722 +; WAVE64-NEXT: .cfi_undefined 2723 +; WAVE64-NEXT: .cfi_undefined 2724 +; WAVE64-NEXT: .cfi_undefined 2725 +; WAVE64-NEXT: .cfi_undefined 2726 +; WAVE64-NEXT: .cfi_undefined 2727 + +; WAVE64-NEXT: .cfi_undefined 2736 +; WAVE64-NEXT: .cfi_undefined 2737 +; WAVE64-NEXT: .cfi_undefined 2738 +; WAVE64-NEXT: .cfi_undefined 2739 +; WAVE64-NEXT: .cfi_undefined 2740 +; WAVE64-NEXT: .cfi_undefined 2741 +; WAVE64-NEXT: .cfi_undefined 2742 +; WAVE64-NEXT: .cfi_undefined 2743 + +; WAVE64-NEXT: .cfi_undefined 2752 +; WAVE64-NEXT: .cfi_undefined 2753 +; WAVE64-NEXT: .cfi_undefined 2754 +; WAVE64-NEXT: .cfi_undefined 2755 +; WAVE64-NEXT: .cfi_undefined 2756 +; WAVE64-NEXT: .cfi_undefined 2757 +; WAVE64-NEXT: .cfi_undefined 2758 +; WAVE64-NEXT: .cfi_undefined 2759 + +; WAVE64-NEXT: .cfi_undefined 2768 +; WAVE64-NEXT: .cfi_undefined 2769 +; WAVE64-NEXT: 
.cfi_undefined 2770 +; WAVE64-NEXT: .cfi_undefined 2771 +; WAVE64-NEXT: .cfi_undefined 2772 +; WAVE64-NEXT: .cfi_undefined 2773 +; WAVE64-NEXT: .cfi_undefined 2774 +; WAVE64-NEXT: .cfi_undefined 2775 + +; WAVE64-NEXT: .cfi_undefined 2784 +; WAVE64-NEXT: .cfi_undefined 2785 +; WAVE64-NEXT: .cfi_undefined 2786 +; WAVE64-NEXT: .cfi_undefined 2787 +; WAVE64-NEXT: .cfi_undefined 2788 +; WAVE64-NEXT: .cfi_undefined 2789 +; WAVE64-NEXT: .cfi_undefined 2790 +; WAVE64-NEXT: .cfi_undefined 2791 + +; WAVE64-NEXT: .cfi_undefined 2800 +; WAVE64-NEXT: .cfi_undefined 2801 +; WAVE64-NEXT: .cfi_undefined 2802 +; WAVE64-NEXT: .cfi_undefined 2803 +; WAVE64-NEXT: .cfi_undefined 2804 +; WAVE64-NEXT: .cfi_undefined 2805 +; WAVE64-NEXT: .cfi_undefined 2806 +; WAVE64-NEXT: .cfi_undefined 2807 + + +; VGPR0_wave32 = 1536 +; WAVE32-NEXT: .cfi_undefined 1536 +; WAVE32-NEXT: .cfi_undefined 1537 +; WAVE32-NEXT: .cfi_undefined 1538 +; WAVE32-NEXT: .cfi_undefined 1539 +; WAVE32-NEXT: .cfi_undefined 1540 +; WAVE32-NEXT: .cfi_undefined 1541 +; WAVE32-NEXT: .cfi_undefined 1542 +; WAVE32-NEXT: .cfi_undefined 1543 +; WAVE32-NEXT: .cfi_undefined 1544 +; WAVE32-NEXT: .cfi_undefined 1545 +; WAVE32-NEXT: .cfi_undefined 1546 +; WAVE32-NEXT: .cfi_undefined 1547 +; WAVE32-NEXT: .cfi_undefined 1548 +; WAVE32-NEXT: .cfi_undefined 1549 +; WAVE32-NEXT: .cfi_undefined 1550 +; WAVE32-NEXT: .cfi_undefined 1551 +; WAVE32-NEXT: .cfi_undefined 1552 +; WAVE32-NEXT: .cfi_undefined 1553 +; WAVE32-NEXT: .cfi_undefined 1554 +; WAVE32-NEXT: .cfi_undefined 1555 +; WAVE32-NEXT: .cfi_undefined 1556 +; WAVE32-NEXT: .cfi_undefined 1557 +; WAVE32-NEXT: .cfi_undefined 1558 +; WAVE32-NEXT: .cfi_undefined 1559 +; WAVE32-NEXT: .cfi_undefined 1560 +; WAVE32-NEXT: .cfi_undefined 1561 +; WAVE32-NEXT: .cfi_undefined 1562 +; WAVE32-NEXT: .cfi_undefined 1563 +; WAVE32-NEXT: .cfi_undefined 1564 +; WAVE32-NEXT: .cfi_undefined 1565 +; WAVE32-NEXT: .cfi_undefined 1566 +; WAVE32-NEXT: .cfi_undefined 1567 +; WAVE32-NEXT: .cfi_undefined 1568 +; 
WAVE32-NEXT: .cfi_undefined 1569 +; WAVE32-NEXT: .cfi_undefined 1570 +; WAVE32-NEXT: .cfi_undefined 1571 +; WAVE32-NEXT: .cfi_undefined 1572 +; WAVE32-NEXT: .cfi_undefined 1573 +; WAVE32-NEXT: .cfi_undefined 1574 +; WAVE32-NEXT: .cfi_undefined 1575 + +; VGPR48_wave32 = 1584 +; WAVE32-NEXT: .cfi_undefined 1584 +; WAVE32-NEXT: .cfi_undefined 1585 +; WAVE32-NEXT: .cfi_undefined 1586 +; WAVE32-NEXT: .cfi_undefined 1587 +; WAVE32-NEXT: .cfi_undefined 1588 +; WAVE32-NEXT: .cfi_undefined 1589 +; WAVE32-NEXT: .cfi_undefined 1590 +; WAVE32-NEXT: .cfi_undefined 1591 + +; WAVE32-NEXT: .cfi_undefined 1600 +; WAVE32-NEXT: .cfi_undefined 1601 +; WAVE32-NEXT: .cfi_undefined 1602 +; WAVE32-NEXT: .cfi_undefined 1603 +; WAVE32-NEXT: .cfi_undefined 1604 +; WAVE32-NEXT: .cfi_undefined 1605 +; WAVE32-NEXT: .cfi_undefined 1606 +; WAVE32-NEXT: .cfi_undefined 1607 + +; WAVE32-NEXT: .cfi_undefined 1616 +; WAVE32-NEXT: .cfi_undefined 1617 +; WAVE32-NEXT: .cfi_undefined 1618 +; WAVE32-NEXT: .cfi_undefined 1619 +; WAVE32-NEXT: .cfi_undefined 1620 +; WAVE32-NEXT: .cfi_undefined 1621 +; WAVE32-NEXT: .cfi_undefined 1622 +; WAVE32-NEXT: .cfi_undefined 1623 + +; WAVE32-NEXT: .cfi_undefined 1632 +; WAVE32-NEXT: .cfi_undefined 1633 +; WAVE32-NEXT: .cfi_undefined 1634 +; WAVE32-NEXT: .cfi_undefined 1635 +; WAVE32-NEXT: .cfi_undefined 1636 +; WAVE32-NEXT: .cfi_undefined 1637 +; WAVE32-NEXT: .cfi_undefined 1638 +; WAVE32-NEXT: .cfi_undefined 1639 + +; WAVE32-NEXT: .cfi_undefined 1648 +; WAVE32-NEXT: .cfi_undefined 1649 +; WAVE32-NEXT: .cfi_undefined 1650 +; WAVE32-NEXT: .cfi_undefined 1651 +; WAVE32-NEXT: .cfi_undefined 1652 +; WAVE32-NEXT: .cfi_undefined 1653 +; WAVE32-NEXT: .cfi_undefined 1654 +; WAVE32-NEXT: .cfi_undefined 1655 + +; WAVE32-NEXT: .cfi_undefined 1664 +; WAVE32-NEXT: .cfi_undefined 1665 +; WAVE32-NEXT: .cfi_undefined 1666 +; WAVE32-NEXT: .cfi_undefined 1667 +; WAVE32-NEXT: .cfi_undefined 1668 +; WAVE32-NEXT: .cfi_undefined 1669 +; WAVE32-NEXT: .cfi_undefined 1670 +; WAVE32-NEXT: 
.cfi_undefined 1671 + +; WAVE32-NEXT: .cfi_undefined 1680 +; WAVE32-NEXT: .cfi_undefined 1681 +; WAVE32-NEXT: .cfi_undefined 1682 +; WAVE32-NEXT: .cfi_undefined 1683 +; WAVE32-NEXT: .cfi_undefined 1684 +; WAVE32-NEXT: .cfi_undefined 1685 +; WAVE32-NEXT: .cfi_undefined 1686 +; WAVE32-NEXT: .cfi_undefined 1687 + +; WAVE32-NEXT: .cfi_undefined 1696 +; WAVE32-NEXT: .cfi_undefined 1697 +; WAVE32-NEXT: .cfi_undefined 1698 +; WAVE32-NEXT: .cfi_undefined 1699 +; WAVE32-NEXT: .cfi_undefined 1700 +; WAVE32-NEXT: .cfi_undefined 1701 +; WAVE32-NEXT: .cfi_undefined 1702 +; WAVE32-NEXT: .cfi_undefined 1703 + +; WAVE32-NEXT: .cfi_undefined 1712 +; WAVE32-NEXT: .cfi_undefined 1713 +; WAVE32-NEXT: .cfi_undefined 1714 +; WAVE32-NEXT: .cfi_undefined 1715 +; WAVE32-NEXT: .cfi_undefined 1716 +; WAVE32-NEXT: .cfi_undefined 1717 +; WAVE32-NEXT: .cfi_undefined 1718 +; WAVE32-NEXT: .cfi_undefined 1719 + +; WAVE32-NEXT: .cfi_undefined 1728 +; WAVE32-NEXT: .cfi_undefined 1729 +; WAVE32-NEXT: .cfi_undefined 1730 +; WAVE32-NEXT: .cfi_undefined 1731 +; WAVE32-NEXT: .cfi_undefined 1732 +; WAVE32-NEXT: .cfi_undefined 1733 +; WAVE32-NEXT: .cfi_undefined 1734 +; WAVE32-NEXT: .cfi_undefined 1735 + +; WAVE32-NEXT: .cfi_undefined 1744 +; WAVE32-NEXT: .cfi_undefined 1745 +; WAVE32-NEXT: .cfi_undefined 1746 +; WAVE32-NEXT: .cfi_undefined 1747 +; WAVE32-NEXT: .cfi_undefined 1748 +; WAVE32-NEXT: .cfi_undefined 1749 +; WAVE32-NEXT: .cfi_undefined 1750 +; WAVE32-NEXT: .cfi_undefined 1751 + +; WAVE32-NEXT: .cfi_undefined 1760 +; WAVE32-NEXT: .cfi_undefined 1761 +; WAVE32-NEXT: .cfi_undefined 1762 +; WAVE32-NEXT: .cfi_undefined 1763 +; WAVE32-NEXT: .cfi_undefined 1764 +; WAVE32-NEXT: .cfi_undefined 1765 +; WAVE32-NEXT: .cfi_undefined 1766 +; WAVE32-NEXT: .cfi_undefined 1767 + +; WAVE32-NEXT: .cfi_undefined 1776 +; WAVE32-NEXT: .cfi_undefined 1777 +; WAVE32-NEXT: .cfi_undefined 1778 +; WAVE32-NEXT: .cfi_undefined 1779 +; WAVE32-NEXT: .cfi_undefined 1780 +; WAVE32-NEXT: .cfi_undefined 1781 +; WAVE32-NEXT: 
.cfi_undefined 1782 +; WAVE32-NEXT: .cfi_undefined 1783 + + +; SGPR0 = 32 +; CHECK-NEXT: .cfi_undefined 32 +; CHECK-NEXT: .cfi_undefined 33 +; CHECK-NEXT: .cfi_undefined 34 +; CHECK-NEXT: .cfi_undefined 35 +; CHECK-NEXT: .cfi_undefined 36 +; CHECK-NEXT: .cfi_undefined 37 +; CHECK-NEXT: .cfi_undefined 38 +; CHECK-NEXT: .cfi_undefined 39 +; CHECK-NEXT: .cfi_undefined 40 +; CHECK-NEXT: .cfi_undefined 41 +; CHECK-NEXT: .cfi_undefined 42 +; CHECK-NEXT: .cfi_undefined 43 +; CHECK-NEXT: .cfi_undefined 44 +; CHECK-NEXT: .cfi_undefined 45 +; CHECK-NEXT: .cfi_undefined 46 +; CHECK-NEXT: .cfi_undefined 47 +; CHECK-NEXT: .cfi_undefined 48 +; CHECK-NEXT: .cfi_undefined 49 +; CHECK-NEXT: .cfi_undefined 50 +; CHECK-NEXT: .cfi_undefined 51 +; CHECK-NEXT: .cfi_undefined 52 +; CHECK-NEXT: .cfi_undefined 53 +; CHECK-NEXT: .cfi_undefined 54 +; CHECK-NEXT: .cfi_undefined 55 +; CHECK-NEXT: .cfi_undefined 56 +; CHECK-NEXT: .cfi_undefined 57 +; CHECK-NEXT: .cfi_undefined 58 +; CHECK-NEXT: .cfi_undefined 59 +; CHECK-NEXT: .cfi_undefined 60 +; CHECK-NEXT: .cfi_undefined 61 +; CHECK-NEXT: .cfi_undefined 62 +; CHECK-NEXT: .cfi_undefined 63 + +; CHECK-NOT: .cfi_{{.*}} + +; WAVE64: s_or_saveexec_b64 s[4:5], -1 +; WAVE32: s_or_saveexec_b32 s4, -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; VGPR40_wave64 = 2600 +; WAVE64-NEXT: .cfi_offset 2600, 0 +; VGPR40_wave32 = 1576 +; WAVE32-NEXT: .cfi_offset 1576, 0 +; CHECK-NOT: .cfi_{{.*}} +; WAVE64: s_mov_b64 exec, s[4:5] +; WAVE32: s_mov_b32 exec_lo, s4 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: v_writelane_b32 v40, s33, 2 + +; DW_CFA_expression [0x10] SGPR33 ULEB128(65)=[0x41] +; BLOCK_LENGTH ULEB128(5)=[0x05] +; DW_OP_regx [0x90] +; VGPR40_wave64 ULEB128(2600)=[0xa8, 0x14] +; DW_OP_LLVM_offset_uconst [0xe4] +; OFFSET ULEB128(0x08) [0x08] +; WAVE64-NEXT: .cfi_escape 0x10, 0x41, 0x05, 0x90, 0xa8, 0x14, 0xe4, 0x08 + +; DW_CFA_expression [0x10] SGPR33 ULEB128(65)=[0x41] +; BLOCK_LENGTH ULEB128(5)=[0x05] +; DW_OP_regx 
[0x90] +; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c] +; DW_OP_LLVM_offset_uconst [0xe4] +; OFFSET ULEB128(0x08) [0x08] +; WAVE32-NEXT: .cfi_escape 0x10, 0x41, 0x05, 0x90, 0xa8, 0x0c, 0xe4, 0x08 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: s_mov_b32 s33, s32 +; SGPR33 = 65 +; CHECK-NEXT: .cfi_def_cfa_register 65 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: s_sub_u32 s32, s32, +; CHECK-NEXT: v_readlane_b32 s33, v40, 2 +; SGPR32 = 64 +; CHECK-NEXT: .cfi_def_cfa_register 64 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: .cfi_endproc +define hidden void @func_call_clobber() #0 { +entry: + call void @ex() #0 + ret void +} + +; NOTE: Number of VGPRs available to kernel, and in turn number of corresponding CFIs generated, +; is dependent on waves/WG size. Since the intent here is to check whether we generate the correct +; CFIs, doing it for any one set of details is sufficient which also makes the test insensitive to +; changes in those details. +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="128,128" } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="128,128" "frame-pointer"="all" } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2, !3} diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -99,8 +99,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -128,15 +128,15 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: 
v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -159,11 +159,11 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 @@ -190,15 +190,15 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: 
v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 @@ -225,11 +225,11 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 @@ -256,15 +256,15 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 @@ -292,8 +292,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -320,13 +320,13 @@ 
; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -350,11 +350,11 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 @@ -379,15 +379,15 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: 
v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -412,11 +412,11 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 @@ -441,15 +441,15 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -475,8 +475,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -503,13 +503,13 @@ ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -533,11 +533,11 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 @@ -562,15 +562,15 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 
v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -595,11 +595,11 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 @@ -624,15 +624,15 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -658,8 +658,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -686,13 +686,13 @@ ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -717,8 +717,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -746,11 +746,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 @@ -777,12 +777,12 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, 
s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 @@ -807,16 +807,16 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -842,8 +842,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 @@ -873,12 +873,12 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: 
s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 @@ -906,12 +906,12 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 @@ -938,15 +938,15 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 @@ -976,12 +976,12 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, 
s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 @@ -1010,16 +1010,16 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 4 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 @@ -1050,8 +1050,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -1078,13 +1078,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ 
-1109,8 +1109,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -1137,13 +1137,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -1168,8 +1168,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 @@ -1197,11 +1197,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, 
external_void_func_v2f32@rel32@hi+12 @@ -1229,8 +1229,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 @@ -1259,11 +1259,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 @@ -1292,8 +1292,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 @@ -1324,13 +1324,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0.5 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 @@ 
-1359,8 +1359,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -1388,11 +1388,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 @@ -1420,8 +1420,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 @@ -1451,12 +1451,12 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 @@ -1485,8 +1485,8 @@ ; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 @@ -1518,14 +1518,14 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 @@ -1553,10 +1553,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 @@ -1581,14 +1581,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: 
s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -1613,10 +1613,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 @@ -1641,14 +1641,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -1673,10 +1673,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, 
s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 @@ -1701,14 +1701,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -1734,8 +1734,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 3 @@ -1763,11 +1763,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, 
external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -1795,8 +1795,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 @@ -1824,11 +1824,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -1855,10 +1855,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 @@ -1883,14 +1883,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; 
GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -1916,8 +1916,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 @@ -1945,11 +1945,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -1976,10 +1976,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, 
s4, external_void_func_v2f16@rel32@lo+4 @@ -2004,14 +2004,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -2036,10 +2036,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 @@ -2064,14 +2064,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, 
external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -2097,8 +2097,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 @@ -2126,11 +2126,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -2158,8 +2158,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 @@ -2188,11 +2188,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, 
s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 @@ -2221,8 +2221,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 @@ -2252,12 +2252,12 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 @@ -2285,10 +2285,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 @@ -2313,14 +2313,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: 
global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -2346,8 +2346,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 @@ -2377,12 +2377,12 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 @@ -2411,8 +2411,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 @@ -2443,13 +2443,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 @@ -2477,19 +2477,19 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2509,12 +2509,12 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: 
s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -2549,8 +2549,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 @@ -2584,16 +2584,16 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-NEXT: v_mov_b32_e32 v7, 8 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -2621,21 +2621,21 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 ; GFX9-NEXT: 
global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2655,12 +2655,12 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 @@ -2696,11 +2696,13 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 @@ -2709,12 +2711,11 @@ ; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 ; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX9-NEXT: 
s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2734,12 +2735,12 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -2779,11 +2780,13 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 @@ -2792,8 +2795,7 @@ ; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 ; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; 
GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: global_load_dword v32, v[0:1], off @@ -2821,12 +2823,12 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2908,17 +2910,17 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v42, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_mov_b32_e32 v40, v0 -; GFX10-NEXT: v_writelane_b32 v42, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v41, v1 ; GFX10-NEXT: v_writelane_b32 v42, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2947,19 +2949,19 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 
4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:4 ; GFX9-NEXT: global_load_ubyte v0, v2, s[4:5] -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2979,12 +2981,12 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -3051,17 +3053,17 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 -; 
GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -3131,21 +3133,21 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x400 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 ; GFX10-NEXT: s_swappc_b64 
s[30:31], s[4:5] ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 @@ -3164,6 +3166,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[4:5] %in.val = alloca { i8, i32 }, align 4, addrspace(5) %out.val = alloca { i8, i32 }, align 4, addrspace(5) @@ -3189,15 +3192,15 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 @@ -3239,12 +3242,12 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] @@ -3329,14 +3332,14 @@ ; GFX10-NEXT: 
s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x400 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x400 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 @@ -3368,8 +3371,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -3397,15 +3400,15 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -3429,8 +3432,8 @@ ; GFX9-NEXT: buffer_store_dword v40, 
off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_getpc_b64 s[6:7] @@ -3457,13 +3460,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i8_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -3488,8 +3491,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_getpc_b64 s[6:7] @@ -3516,13 +3519,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; 
GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -3547,8 +3550,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 42 ; GFX9-NEXT: s_getpc_b64 s[6:7] @@ -3575,13 +3578,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 42 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s4, 42 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -3606,8 +3609,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_mov_b32 s5, 0 @@ -3635,11 +3638,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, 
external_void_func_i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i64_inreg@rel32@hi+12 @@ -3666,11 +3669,11 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 @@ -3695,15 +3698,15 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[8:9] ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -3729,8 +3732,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -3760,12 +3763,12 @@ ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[8:9] ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 @@ -3793,11 +3796,11 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s8, 1 ; GFX9-NEXT: s_mov_b32 s9, 2 @@ -3824,13 +3827,13 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s8, 1 ; GFX10-NEXT: s_mov_b32 s9, 2 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[10:11] ; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v3i64_inreg@rel32@lo+4 @@ -3861,11 +3864,11 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; 
GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s8, 1 ; GFX9-NEXT: s_mov_b32 s9, 2 @@ -3894,15 +3897,15 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s8, 1 ; GFX10-NEXT: s_mov_b32 s9, 2 ; GFX10-NEXT: s_mov_b32 s10, 3 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s11, 4 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[12:13] ; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v4i64_inreg@rel32@lo+4 @@ -3933,8 +3936,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_movk_i32 s4, 0x4400 ; GFX9-NEXT: s_getpc_b64 s[6:7] @@ -3961,13 +3964,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_movk_i32 s4, 0x4400 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x4400 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f16_inreg@rel32@hi+12 -; 
GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -3992,8 +3995,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 4.0 ; GFX9-NEXT: s_getpc_b64 s[6:7] @@ -4020,13 +4023,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 4.0 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s4, 4.0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -4051,8 +4054,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -4080,11 +4083,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: s_mov_b32 s5, 2.0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: 
v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2f32_inreg@rel32@hi+12 @@ -4112,8 +4115,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -4142,11 +4145,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_mov_b32 s6, 4.0 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[8:9] ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3f32_inreg@rel32@lo+4 @@ -4175,8 +4178,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -4207,13 +4210,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_mov_b32 s6, 4.0 ; GFX10-NEXT: s_mov_b32 s7, -1.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s8, 0.5 -; GFX10-NEXT: s_mov_b32 s33, s32 ; 
GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[10:11] ; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v5f32_inreg@rel32@lo+4 @@ -4242,8 +4245,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 @@ -4271,11 +4274,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s5, 0x40100000 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f64_inreg@rel32@hi+12 @@ -4303,8 +4306,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -4334,12 +4337,12 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_mov_b32 s7, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: 
s_getpc_b64 s[8:9] ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2f64_inreg@rel32@lo+4 @@ -4368,8 +4371,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -4401,14 +4404,14 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_mov_b32 s7, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_mov_b32 s9, 0x40200000 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[10:11] ; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v3f64_inreg@rel32@lo+4 @@ -4436,10 +4439,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i16_inreg@rel32@lo+4 @@ -4464,14 +4467,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; 
GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -4496,10 +4499,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 @@ -4524,14 +4527,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -4556,10 +4559,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: 
s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 @@ -4584,14 +4587,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -4617,8 +4620,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 3 @@ -4646,11 +4649,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-NEXT: s_mov_b32 s5, 3 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 
0 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 @@ -4678,8 +4681,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX9-NEXT: s_movk_i32 s5, 0x4400 @@ -4707,11 +4710,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX10-NEXT: s_movk_i32 s5, 0x4400 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 @@ -4738,10 +4741,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 @@ -4766,14 +4769,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: 
v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -4799,8 +4802,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 0x40003 @@ -4828,11 +4831,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-NEXT: s_mov_b32 s5, 0x40003 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 @@ -4859,10 +4862,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; 
GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2f16_inreg@rel32@lo+4 @@ -4887,14 +4890,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -4919,10 +4922,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 @@ -4947,14 +4950,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 ; 
GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -4980,8 +4983,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -5009,11 +5012,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 @@ -5041,8 +5044,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 @@ -5071,11 +5074,11 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 ; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: s_mov_b32 s6, 5 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: 
s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[8:9] ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_inreg@rel32@lo+4 @@ -5104,8 +5107,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 @@ -5135,12 +5138,12 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 ; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: s_mov_b32 s6, 5 ; GFX10-NEXT: s_mov_b32 s7, 6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[8:9] ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -5168,10 +5171,10 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 @@ -5196,14 +5199,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_load_dwordx4 
s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[8:9] ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -5229,8 +5232,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -5260,12 +5263,12 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[8:9] ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 @@ -5294,8 +5297,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -5326,13 +5329,13 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 
v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s8, 5 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[10:11] ; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v5i32_inreg@rel32@lo+4 @@ -5360,17 +5363,17 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[12:13] ; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5390,14 +5393,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[12:13] ; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s13, 
s13, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 @@ -5426,8 +5429,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -5461,16 +5464,16 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s8, 5 ; GFX10-NEXT: s_mov_b32 s9, 6 ; GFX10-NEXT: s_mov_b32 s10, 7 ; GFX10-NEXT: s_mov_b32 s11, 8 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[12:13] ; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 @@ -5498,17 +5501,17 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[20:21] ; GFX9-NEXT: s_add_u32 s20, s20, 
external_void_func_v16i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v16i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5528,14 +5531,14 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[20:21] ; GFX10-NEXT: s_add_u32 s20, s20, external_void_func_v16i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s21, s21, external_void_func_v16i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 @@ -5564,6 +5567,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 18 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_writelane_b32 v40, s36, 0 ; GFX9-NEXT: v_writelane_b32 v40, s37, 1 ; GFX9-NEXT: v_writelane_b32 v40, s38, 2 @@ -5584,10 +5588,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 ; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 17 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, 
external_void_func_v32i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s46 ; GFX9-NEXT: v_mov_b32_e32 v1, s47 @@ -5610,9 +5616,6 @@ ; GFX9-NEXT: s_mov_b32 s27, s43 ; GFX9-NEXT: s_mov_b32 s28, s44 ; GFX9-NEXT: s_mov_b32 s29, s45 -; GFX9-NEXT: s_getpc_b64 s[30:31] -; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s4, v40, 16 @@ -5650,10 +5653,10 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 -; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s36, 0 +; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-NEXT: v_writelane_b32 v40, s39, 3 @@ -5742,6 +5745,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 18 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_writelane_b32 v40, s36, 0 ; GFX9-NEXT: v_writelane_b32 v40, s37, 1 ; GFX9-NEXT: v_writelane_b32 v40, s38, 2 @@ -5763,10 +5767,10 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 ; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, s22 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_writelane_b32 v40, s30, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s46 ; GFX9-NEXT: v_mov_b32_e32 v1, s47 @@ -5777,7 +5781,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 
s49 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s50 -; GFX9-NEXT: v_writelane_b32 v40, s30, 16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; GFX9-NEXT: v_mov_b32_e32 v0, s51 ; GFX9-NEXT: s_mov_b32 s20, s36 @@ -5831,18 +5834,18 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s36, 0 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 ; GFX10-NEXT: s_load_dword s22, s[4:5], 0x0 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s36, 0 ; GFX10-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-NEXT: v_writelane_b32 v40, s39, 3 +; GFX10-NEXT: v_writelane_b32 v40, s40, 4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s22 -; GFX10-NEXT: v_writelane_b32 v40, s40, 4 ; GFX10-NEXT: v_writelane_b32 v40, s41, 5 ; GFX10-NEXT: v_writelane_b32 v40, s42, 6 ; GFX10-NEXT: v_writelane_b32 v40, s43, 7 @@ -5962,14 +5965,14 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 @@ -6066,10 +6069,10 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: 
v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-NEXT: v_mov_b32_e32 v2, 14 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 @@ -6223,11 +6226,11 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 10 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 14 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -6385,11 +6388,11 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -12,10 +12,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; 
GFX9-NEXT: v_writelane_b32 v40, s35, 1 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 @@ -47,8 +47,8 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s34, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s35, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 @@ -108,9 +108,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART @@ -146,11 +146,11 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: ;;#ASMSTART @@ -187,8 +187,8 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v41, s33, 2 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: buffer_store_dword v40, off, 
s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART @@ -225,17 +225,17 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v41, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v40, v31 ; GFX10-NEXT: v_writelane_b32 v41, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v40, v31 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_mov_b32_e32 v31, v40 ; GFX10-NEXT: ;;#ASMSTART @@ -303,11 +303,11 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s33, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s33, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND @@ -342,9 +342,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -378,11 +378,11 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s34, 0 
; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND @@ -417,8 +417,8 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v41, s33, 2 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -453,12 +453,12 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v41, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND @@ -576,8 +576,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4 @@ -604,11 +604,11 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; 
GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -633,8 +633,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4 @@ -661,11 +661,11 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -690,9 +690,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_writelane_b32 v40, s40, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -726,11 +726,11 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s40, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 
0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s40, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND @@ -765,9 +765,9 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v41, s33, 3 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_writelane_b32 v41, s40, 0 ; GFX9-NEXT: v_writelane_b32 v41, s30, 1 -; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART @@ -810,20 +810,20 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v41, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v41, s40, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v41, s40, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v40, v32 ; GFX10-NEXT: v_writelane_b32 v41, s30, 1 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v40, v32 ; GFX10-NEXT: v_writelane_b32 v41, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- 
a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -147,6 +147,7 @@ ; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc ; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: func_local_stack_offset_uses_sp: @@ -184,6 +185,7 @@ ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %pin.low = alloca i32, align 8192, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -220,8 +220,8 @@ ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_mov_b32 s7, s33 -; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x400 ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc ; MUBUF-NEXT: s_cbranch_execz BB2_3 @@ -256,8 +256,8 @@ ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s5, s33 -; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_add_u32 s32, s32, 16 ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz BB2_3 diff --git a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll --- a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll +++ b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll @@ -13,6 +13,8 @@ ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, 
DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp0: ; GCN-NEXT: .loc 0 4 5 prologue_end ; /tmp/dbg.cl:4:5 @@ -35,6 +37,12 @@ ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 +; GCN-NEXT: .cfi_undefined 2560 +; GCN-NEXT: .cfi_undefined 2561 +; GCN-NEXT: .cfi_undefined 2562 +; GCN-NEXT: .cfi_undefined 2563 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp2: ; GCN-NEXT: .loc 0 8 17 prologue_end ; /tmp/dbg.cl:8:17 @@ -65,6 +73,8 @@ ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: ;DEBUG_VALUE: split_v4f16_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f16_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp8: ; GCN-NEXT: .loc 0 12 5 prologue_end ; /tmp/dbg.cl:12:5 @@ -83,6 +93,8 @@ ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: ;DEBUG_VALUE: 
split_f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp10: ; GCN-NEXT: .loc 0 16 5 prologue_end ; /tmp/dbg.cl:16:5 @@ -103,6 +115,8 @@ ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp12: ; GCN-NEXT: .loc 0 20 5 prologue_end ; /tmp/dbg.cl:20:5 @@ -121,6 +135,8 @@ ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: ;DEBUG_VALUE: split_i64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_i64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp14: ; GCN-NEXT: .loc 0 24 5 prologue_end ; /tmp/dbg.cl:24:5 @@ -139,6 +155,8 @@ ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: ;DEBUG_VALUE: split_ptr_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_ptr_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; 
GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp16: ; GCN-NEXT: .loc 0 28 5 prologue_end ; /tmp/dbg.cl:28:5 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -125,13 +125,13 @@ ; GCN-LABEL: {{^}}default_realign_align128: ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_u32 s33, s32, 0x1fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GCN-NEXT: s_add_u32 s32, s32, 0x4000 -; GCN-NOT: s33 +; GCN-NEXT: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, [[TMP]], 0xffffe000 +; GCN: s_add_u32 s32, s32, 0x4000 ; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}} ; GCN: s_sub_u32 s32, s32, 0x4000 ; GCN: s_mov_b32 s33, [[FP_COPY]] +; GCN-NOT: s33 define void @default_realign_align128(i32 %idx) #0 { %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128 @@ -194,10 +194,10 @@ ; GCN-LABEL: needs_align1024_stack_args_used_inside_loop: ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_u32 s33, s32, 0xffc0 ; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 +; GCN-NEXT: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 +; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 ; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34 ; GCN: s_add_u32 s32, s32, 0x30000 @@ -234,9 +234,8 @@ ; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy: ; GCN: ; %bb.0: ; GCN: v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN: s_mov_b32 s34, s32 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 -; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0 ; GCN: 
buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/DebugInfo/AMDGPU/cfi.ll b/llvm/test/DebugInfo/AMDGPU/cfi.ll --- a/llvm/test/DebugInfo/AMDGPU/cfi.ll +++ b/llvm/test/DebugInfo/AMDGPU/cfi.ll @@ -21,6 +21,11 @@ ; CHECK-EMPTY: ; CHECK-NEXT: 00000020 {{[0-9]+}} 00000000 FDE cie=00000000 pc=00000000...{{[0-9]+}} ; CHECK-NEXT: Format: DWARF32 +; CHECK-NEXT: DW_CFA_LLVM_def_aspace_cfa: SGPR32_LO16 +0 as6 +; CHECK-NEXT: DW_CFA_expression: PC_REG DW_OP_regx SGPR30_LO16, DW_OP_piece 0x4, DW_OP_regx SGPR31_LO16, DW_OP_piece 0x4 +; CHECK-NEXT: DW_CFA_nop +; CHECK-EMPTY: +; CHECK-NEXT: 0x0: CFA=SGPR32_LO16 as6: PC_REG=[DW_OP_regx SGPR30_LO16, DW_OP_piece 0x4, DW_OP_regx SGPR31_LO16, DW_OP_piece 0x4] ; CHECK-EMPTY: ; CHECK: .eh_frame contents: ; CHECK-NOT: CIE