diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -36,7 +36,10 @@ class MachineDominatorTree; class LegalizerInfo; struct LegalityQuery; +class RegisterBank; +class RegisterBankInfo; class TargetLowering; +class TargetRegisterInfo; struct PreferredTuple { LLT Ty; // The result type of the extend. @@ -54,6 +57,7 @@ struct PtrAddChain { int64_t Imm; Register Base; + const RegisterBank *Bank; }; struct RegisterImmPair { @@ -95,6 +99,8 @@ GISelKnownBits *KB; MachineDominatorTree *MDT; const LegalizerInfo *LI; + const RegisterBankInfo *RBI; + const TargetRegisterInfo *TRI; public: CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B, @@ -120,6 +126,18 @@ void replaceRegOpWith(MachineRegisterInfo &MRI, MachineOperand &FromRegOp, Register ToReg) const; + /// Get the register bank of \p Reg. + /// If Reg has not been assigned a register, a register class, + /// or a register bank, then this returns nullptr. + /// + /// \pre Reg.isValid() + const RegisterBank *getRegBank(Register Reg) const; + + /// Set the register bank of \p Reg. + /// Does nothing if the RegBank is null. + /// This is the counterpart to getRegBank. + void setRegBank(Register Reg, const RegisterBank *RegBank); + /// If \p MI is COPY, try to combine it. /// Returns true if MI changed. bool tryCombineCopy(MachineInstr &MI); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -46,8 +47,9 @@ MachineIRBuilder &B, GISelKnownBits *KB, MachineDominatorTree *MDT, const LegalizerInfo *LI) - : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), - KB(KB), MDT(MDT), LI(LI) { + : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), KB(KB), + MDT(MDT), LI(LI), RBI(Builder.getMF().getSubtarget().getRegBankInfo()), + TRI(Builder.getMF().getSubtarget().getRegisterInfo()) { (void)this->KB; } @@ -143,6 +145,15 @@ Observer.changedInstr(*FromRegOp.getParent()); } +const RegisterBank *CombinerHelper::getRegBank(Register Reg) const { + return RBI->getRegBank(Reg, MRI, *TRI); +} + +void CombinerHelper::setRegBank(Register Reg, const RegisterBank *RegBank) { + if (RegBank) + MRI.setRegBank(Reg, *RegBank); +} + bool CombinerHelper::tryCombineCopy(MachineInstr &MI) { if (matchCombineCopy(MI)) { applyCombineCopy(MI); @@ -1407,7 +1418,6 @@ // Don't promote to an alignment that would require dynamic stack // realignment. - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->hasStackRealignment(MF)) while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) NewAlign = NewAlign / 2; @@ -1512,7 +1522,6 @@ // Don't promote to an alignment that would require dynamic stack // realignment. - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->hasStackRealignment(MF)) while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) NewAlign = NewAlign / 2; @@ -1710,7 +1719,7 @@ if (!MaybeImmVal) return false; - MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2); + MachineInstr *Add2Def = MRI.getVRegDef(Add2); if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD) return false; @@ -1751,6 +1760,7 @@ // Pass the combined immediate to the apply function. MatchInfo.Imm = AMNew.BaseOffs; MatchInfo.Base = Base; + MatchInfo.Bank = getRegBank(Imm2); return true; } @@ -1760,6 +1770,7 @@ MachineIRBuilder MIB(MI); LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg()); auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm); + setRegBank(NewOffset.getReg(0), MatchInfo.Bank); Observer.changingInstr(MI); MI.getOperand(1).setReg(MatchInfo.Base); MI.getOperand(2).setReg(NewOffset.getReg(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -83,7 +83,7 @@ } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> { + "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; let AdditionalArguments = []; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -8,174 +8,191 @@ ; GCN-LABEL: v_extract_v64i32_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_and_b32_e32 v0, 63, v2 -; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 -; GCN-NEXT: v_add_u32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v0, 63, v2 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GCN-NEXT: v_add_u32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v15 +; GCN-NEXT: v_mov_b32_e32 v13, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: v_mov_b32_e32 v15, v18 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -193,36 +210,39 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: v_mov_b32_e32 v7, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr @@ -234,173 +254,190 @@ ; GCN-LABEL: v_extract_v128i16_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_bfe_u32 v0, v2, 1, 6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 1, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_bfe_u32 v0, v2, 1, 6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_and_b32_e32 v1, 1, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v15 +; GCN-NEXT: v_mov_b32_e32 v13, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: v_mov_b32_e32 v15, v18 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -418,40 +455,43 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 -; GCN-NEXT: v_lshrrev_b32_e64 v11, 6, s33 -; GCN-NEXT: v_add_u32_e32 v11, 0x100, v11 -; GCN-NEXT: v_add_u32_e32 v0, v11, v0 +; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: v_mov_b32_e32 v7, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_lshrrev_b32_e64 v7, 6, s33 +; GCN-NEXT: v_add_u32_e32 v7, 0x100, v7 +; GCN-NEXT: v_add_u32_e32 v0, v7, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_waitcnt vmcnt(16) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(15) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -464,174 +504,191 @@ ; GCN-LABEL: v_extract_v32i64_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_and_b32_e32 v0, 31, v2 -; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 -; GCN-NEXT: v_add_u32_e32 v1, v2, v0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v0, 31, v2 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 +; GCN-NEXT: v_add_u32_e32 v1, v2, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v12, v15 +; GCN-NEXT: v_mov_b32_e32 v13, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: v_mov_b32_e32 v15, v18 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -649,37 +706,40 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: v_mov_b32_e32 v7, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <32 x i64>, <32 x i64> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -4138,12 +4138,7 @@ ; GPRIDX-LABEL: v_extract_v64i32_37: ; GPRIDX: ; %bb.0: ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: s_mov_b64 s[4:5], 0x80 -; GPRIDX-NEXT: v_mov_b32_e32 v2, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s5 -; GPRIDX-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GPRIDX-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144 ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: v_mov_b32_e32 v0, v5 ; GPRIDX-NEXT: s_setpc_b64 s[30:31] @@ -4151,12 +4146,7 @@ ; MOVREL-LABEL: v_extract_v64i32_37: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80 -; MOVREL-NEXT: v_mov_b32_e32 v2, s4 -; MOVREL-NEXT: v_mov_b32_e32 v3, s5 -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x90, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) @@ -4167,12 +4157,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b64 s[4:5], 0x80 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -6,123 +6,96 @@ ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v68, 8, v0 -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v0, v68 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v2 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_add_co_u32_e32 v64, vcc, v2, v0 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_addc_co_u32_e32 v65, vcc, v3, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_add_co_u32_e32 v66, vcc, v2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v67, vcc, v3, v1, vcc -; GCN-NEXT: global_load_dwordx4 v[44:47], v68, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[48:51], v68, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v68, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v68, s[0:1] offset:48 -; GCN-NEXT: global_load_dwordx4 v[60:63], v68, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[4:7], v[64:65], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[64:65], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[64:65], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[66:67], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[66:67], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[66:67], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[0:3], v68, s[0:1] offset:128 -; GCN-NEXT: global_load_dwordx4 v[16:19], v68, s[0:1] offset:192 -; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_load_dwordx4 v[32:35], v64, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[36:39], v64, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[40:43], v64, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:80 +; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:96 +; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 +; GCN-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144 +; GCN-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160 +; GCN-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176 +; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 +; GCN-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208 +; GCN-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:224 +; GCN-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240 +; GCN-NEXT: s_waitcnt vmcnt(6) ; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v68, v[0:3], s[2:3] offset:128 -; GCN-NEXT: global_store_dwordx4 v68, v[4:7], s[2:3] offset:144 -; GCN-NEXT: global_store_dwordx4 v68, v[8:11], s[2:3] offset:160 -; GCN-NEXT: global_store_dwordx4 v68, v[12:15], s[2:3] offset:176 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: global_store_dwordx4 v68, v[16:19], s[2:3] offset:192 -; GCN-NEXT: global_store_dwordx4 v68, v[20:23], s[2:3] offset:208 -; GCN-NEXT: global_store_dwordx4 v68, v[24:27], s[2:3] offset:224 -; GCN-NEXT: global_store_dwordx4 v68, v[44:47], s[2:3] -; GCN-NEXT: global_store_dwordx4 v68, v[48:51], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v68, v[52:55], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v68, v[56:59], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v68, v[60:63], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v68, v[28:31], s[2:3] offset:240 -; GCN-NEXT: global_store_dwordx4 v68, v[32:35], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v68, v[36:39], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v68, v[40:43], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 +; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 +; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] +; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112 ; GCN-NEXT: s_endpgm ; ; GFX10-LABEL: v_insert_v64i32_37: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 8, v0 -; GFX10-NEXT: s_movk_i32 s4, 0x80 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_movk_i32 s4, 0xc0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: global_load_dwordx4 v[32:35], v70, s[0:1] -; GFX10-NEXT: global_load_dwordx4 v[36:39], v70, s[0:1] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[40:43], v70, s[0:1] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[44:47], v70, s[0:1] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[48:51], v70, s[0:1] offset:64 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, v70 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v64, vcc_lo, v0, 64 -; GFX10-NEXT: v_add_co_ci_u32_e32 v65, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v66, vcc_lo, v0, v1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v67, vcc_lo, v5, v2, vcc_lo -; GFX10-NEXT: v_add_co_u32 v68, vcc_lo, v0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v69, vcc_lo, v5, v4, vcc_lo -; GFX10-NEXT: s_clause 0xa -; GFX10-NEXT: global_load_dwordx4 v[52:55], v[64:65], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[56:59], v[64:65], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[60:63], v[64:65], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[66:67], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[66:67], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v[66:67], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v[68:69], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v[68:69], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v[68:69], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v70, s[0:1] offset:128 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v70, s[0:1] offset:192 -; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: s_clause 0xf +; GFX10-NEXT: global_load_dwordx4 v[32:35], v64, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[36:39], v64, s[0:1] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[40:43], v64, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:112 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:224 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240 +; GFX10-NEXT: s_waitcnt vmcnt(6) ; GFX10-NEXT: v_mov_b32_e32 v5, 0x3e7 +; GFX10-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 +; GFX10-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: global_store_dwordx4 v70, v[0:3], s[2:3] offset:128 -; GFX10-NEXT: global_store_dwordx4 v70, v[4:7], s[2:3] offset:144 -; GFX10-NEXT: global_store_dwordx4 v70, v[8:11], s[2:3] offset:160 -; GFX10-NEXT: global_store_dwordx4 v70, v[12:15], s[2:3] offset:176 +; GFX10-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 +; GFX10-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16 +; GFX10-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32 +; GFX10-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] offset:48 +; GFX10-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:64 +; GFX10-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:80 +; GFX10-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96 +; GFX10-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v70, v[16:19], s[2:3] offset:192 -; GFX10-NEXT: global_store_dwordx4 v70, v[20:23], s[2:3] offset:208 -; GFX10-NEXT: global_store_dwordx4 v70, v[24:27], s[2:3] offset:224 -; GFX10-NEXT: global_store_dwordx4 v70, v[32:35], s[2:3] -; GFX10-NEXT: global_store_dwordx4 v70, v[36:39], s[2:3] offset:16 -; GFX10-NEXT: global_store_dwordx4 v70, v[40:43], s[2:3] offset:32 -; GFX10-NEXT: global_store_dwordx4 v70, v[44:47], s[2:3] offset:48 -; GFX10-NEXT: global_store_dwordx4 v70, v[48:51], s[2:3] offset:64 -; GFX10-NEXT: global_store_dwordx4 v70, v[52:55], s[2:3] offset:80 -; GFX10-NEXT: global_store_dwordx4 v70, v[56:59], s[2:3] offset:96 -; GFX10-NEXT: global_store_dwordx4 v70, v[60:63], s[2:3] offset:112 -; GFX10-NEXT: global_store_dwordx4 v70, v[28:31], s[2:3] offset:240 +; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id