diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -55,6 +55,12 @@ Register Base; }; +struct PtrAddChainPostRegbank { + int64_t Imm; + Register Base; + unsigned Bank; +}; + struct RegisterImmPair { Register Reg; int64_t Imm; @@ -241,6 +247,11 @@ bool matchPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo); bool applyPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo); + bool matchPtrAddImmedChainPostRegbank(MachineInstr &MI, + PtrAddChainPostRegbank &MatchInfo); + bool applyPtrAddImmedChainPostRegbank(MachineInstr &MI, + PtrAddChainPostRegbank &MatchInfo); + /// Fold (shift (shift base, x), y) -> (shift base (x+y)) bool matchShiftImmedChain(MachineInstr &MI, RegisterImmPair &MatchInfo); bool applyShiftImmedChain(MachineInstr &MI, RegisterImmPair &MatchInfo); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -164,6 +164,13 @@ [{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]), (apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>; +def ptr_add_immed_matchdata_post_regbank : GIDefMatchData<"PtrAddChainPostRegbank">; +def ptr_add_immed_chain_post_regbank : GICombineRule< + (defs root:$d, ptr_add_immed_matchdata_post_regbank:$matchinfo), + (match (wip_match_opcode G_PTR_ADD):$d, + [{ return Helper.matchPtrAddImmedChainPostRegbank(*${d}, ${matchinfo}); }]), + (apply [{ Helper.applyPtrAddImmedChainPostRegbank(*${d}, ${matchinfo}); }])>; + // Fold shift (shift base x), y -> shift base, (x+y), if shifts are same def shift_immed_matchdata : GIDefMatchData<"RegisterImmPair">; def shift_immed_chain : GICombineRule< diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -1699,6 +1700,60 @@ return true; } +bool CombinerHelper::matchPtrAddImmedChainPostRegbank( + MachineInstr &MI, PtrAddChainPostRegbank &MatchInfo) { + // Same as matchPtrAddImmedChain, but also check that the RegBanks of the + // constant are the same. + if (MI.getOpcode() != TargetOpcode::G_PTR_ADD) + return false; + + Register Add2 = MI.getOperand(1).getReg(); + Register Imm1 = MI.getOperand(2).getReg(); + auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI); + if (!MaybeImmVal) + return false; + + MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2); + if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD) + return false; + + Register Base = Add2Def->getOperand(1).getReg(); + Register Imm2 = Add2Def->getOperand(2).getReg(); + auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI); + if (!MaybeImm2Val) + return false; + + auto &MF = *MI.getParent()->getParent(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + unsigned Bank = RBI.getRegBank(Imm1, MRI, TRI)->getID(); + if (Bank != RBI.getRegBank(Imm2, MRI, TRI)->getID()) + return false; + + // Pass the combined immediate to the apply function. + MatchInfo.Imm = (MaybeImmVal->Value + MaybeImm2Val->Value).getSExtValue(); + MatchInfo.Base = Base; + MatchInfo.Bank = Bank; + return true; +} + +bool CombinerHelper::applyPtrAddImmedChainPostRegbank( + MachineInstr &MI, PtrAddChainPostRegbank &MatchInfo) { + // Same as applyPtrAddImmedChain, but also sets the RegBank of the constant. + assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); + MachineIRBuilder MIB(MI); + LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg()); + auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm); + auto &MF = *MI.getParent()->getParent(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + MRI.setRegBank(NewOffset.getReg(0), RBI.getRegBank(MatchInfo.Bank)); + Observer.changingInstr(MI); + MI.getOperand(1).setReg(MatchInfo.Base); + MI.getOperand(2).setReg(NewOffset.getReg(0)); + Observer.changedInstr(MI); + return true; +} + bool CombinerHelper::matchShiftImmedChain(MachineInstr &MI, RegisterImmPair &MatchInfo) { // We're trying to match the following pattern with any of diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -83,7 +83,7 @@ } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> { + "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain_post_regbank]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; let AdditionalArguments = []; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -8,190 +8,170 @@ ; GCN-LABEL: v_extract_v64i32_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_u32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_and_b32_e32 v0, 63, v2 -; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 -; GCN-NEXT: v_add_u32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v0, 63, v2 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GCN-NEXT: v_add_u32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v15 +; GCN-NEXT: v_mov_b32_e32 v13, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: v_mov_b32_e32 v15, v18 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload @@ -209,14 +189,10 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -234,30 +210,39 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: v_mov_b32_e32 v7, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr @@ -269,190 +254,170 @@ ; GCN-LABEL: v_extract_v128i16_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_u32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GCN-NEXT: v_and_b32_e32 v0, 63, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 1, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GCN-NEXT: v_and_b32_e32 v0, 63, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_and_b32_e32 v1, 1, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v15 +; GCN-NEXT: v_mov_b32_e32 v13, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: v_mov_b32_e32 v15, v18 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload @@ -470,14 +435,10 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -495,34 +456,43 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 -; GCN-NEXT: v_lshrrev_b32_e64 v15, 6, s33 -; GCN-NEXT: v_add_u32_e32 v15, 0x100, v15 -; GCN-NEXT: v_add_u32_e32 v0, v15, v0 +; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: v_mov_b32_e32 v7, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_lshrrev_b32_e64 v7, 6, s33 +; GCN-NEXT: v_add_u32_e32 v7, 0x100, v7 +; GCN-NEXT: v_add_u32_e32 v0, v7, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(15) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -535,190 +505,170 @@ ; GCN-LABEL: v_extract_v32i64_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_u32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_and_b32_e32 v0, 31, v2 -; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 -; GCN-NEXT: v_add_u32_e32 v1, v2, v0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v0, 31, v2 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 +; GCN-NEXT: v_add_u32_e32 v1, v2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v15 +; GCN-NEXT: v_mov_b32_e32 v13, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: v_mov_b32_e32 v15, v18 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload @@ -736,14 +686,10 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -761,31 +707,40 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: v_mov_b32_e32 v7, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <32 x i64>, <32 x i64> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -4140,13 +4140,7 @@ ; GPRIDX-LABEL: v_extract_v64i32_37: ; GPRIDX: ; %bb.0: ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: s_movk_i32 s4, 0x80 -; GPRIDX-NEXT: s_mov_b32 s5, 0 -; GPRIDX-NEXT: v_mov_b32_e32 v2, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s5 -; GPRIDX-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GPRIDX-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144 ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: v_mov_b32_e32 v0, v5 ; GPRIDX-NEXT: s_setpc_b64 s[30:31] @@ -4154,13 +4148,7 @@ ; MOVREL-LABEL: v_extract_v64i32_37: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_movk_i32 s4, 0x80 -; MOVREL-NEXT: s_mov_b32 s5, 0 -; MOVREL-NEXT: v_mov_b32_e32 v2, s4 -; MOVREL-NEXT: v_mov_b32_e32 v3, s5 -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x90, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) @@ -4171,13 +4159,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0x80 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -6,123 +6,96 @@ ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v68, 8, v0 -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v0, v68 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v2 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_add_co_u32_e32 v64, vcc, v2, v0 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_addc_co_u32_e32 v65, vcc, v3, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_add_co_u32_e32 v66, vcc, v2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v67, vcc, v3, v1, vcc -; GCN-NEXT: global_load_dwordx4 v[44:47], v68, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[48:51], v68, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v68, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v68, s[0:1] offset:48 -; GCN-NEXT: global_load_dwordx4 v[60:63], v68, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[4:7], v[64:65], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[64:65], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[64:65], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[66:67], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[66:67], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[66:67], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[0:3], v68, s[0:1] offset:128 -; GCN-NEXT: global_load_dwordx4 v[16:19], v68, s[0:1] offset:192 -; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_load_dwordx4 v[32:35], v64, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[36:39], v64, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[40:43], v64, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:80 +; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:96 +; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 +; GCN-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144 +; GCN-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160 +; GCN-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176 +; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 +; GCN-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208 +; GCN-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:224 +; GCN-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240 +; GCN-NEXT: s_waitcnt vmcnt(6) ; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v68, v[0:3], s[2:3] offset:128 -; GCN-NEXT: global_store_dwordx4 v68, v[4:7], s[2:3] offset:144 -; GCN-NEXT: global_store_dwordx4 v68, v[8:11], s[2:3] offset:160 -; GCN-NEXT: global_store_dwordx4 v68, v[12:15], s[2:3] offset:176 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: global_store_dwordx4 v68, v[16:19], s[2:3] offset:192 -; GCN-NEXT: global_store_dwordx4 v68, v[20:23], s[2:3] offset:208 -; GCN-NEXT: global_store_dwordx4 v68, v[24:27], s[2:3] offset:224 -; GCN-NEXT: global_store_dwordx4 v68, v[44:47], s[2:3] -; GCN-NEXT: global_store_dwordx4 v68, v[48:51], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v68, v[52:55], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v68, v[56:59], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v68, v[60:63], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v68, v[28:31], s[2:3] offset:240 -; GCN-NEXT: global_store_dwordx4 v68, v[32:35], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v68, v[36:39], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v68, v[40:43], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 +; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 +; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] +; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112 ; GCN-NEXT: s_endpgm ; ; GFX10-LABEL: v_insert_v64i32_37: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 8, v0 -; GFX10-NEXT: s_movk_i32 s4, 0x80 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_movk_i32 s4, 0xc0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: global_load_dwordx4 v[32:35], v70, s[0:1] -; GFX10-NEXT: global_load_dwordx4 v[36:39], v70, s[0:1] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[40:43], v70, s[0:1] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[44:47], v70, s[0:1] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[48:51], v70, s[0:1] offset:64 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, v70 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v64, vcc_lo, v0, 64 -; GFX10-NEXT: v_add_co_ci_u32_e32 v65, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v66, vcc_lo, v0, v1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v67, vcc_lo, v5, v2, vcc_lo -; GFX10-NEXT: v_add_co_u32 v68, vcc_lo, v0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v69, vcc_lo, v5, v4, vcc_lo -; GFX10-NEXT: s_clause 0xa -; GFX10-NEXT: global_load_dwordx4 v[52:55], v[64:65], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[56:59], v[64:65], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[60:63], v[64:65], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[66:67], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[66:67], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v[66:67], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v[68:69], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v[68:69], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v[68:69], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v70, s[0:1] offset:128 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v70, s[0:1] offset:192 -; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: s_clause 0xf +; GFX10-NEXT: global_load_dwordx4 v[32:35], v64, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[36:39], v64, s[0:1] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[40:43], v64, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:112 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:224 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240 +; GFX10-NEXT: s_waitcnt vmcnt(6) ; GFX10-NEXT: v_mov_b32_e32 v5, 0x3e7 +; GFX10-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 +; GFX10-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: global_store_dwordx4 v70, v[0:3], s[2:3] offset:128 -; GFX10-NEXT: global_store_dwordx4 v70, v[4:7], s[2:3] offset:144 -; GFX10-NEXT: global_store_dwordx4 v70, v[8:11], s[2:3] offset:160 -; GFX10-NEXT: global_store_dwordx4 v70, v[12:15], s[2:3] offset:176 +; GFX10-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 +; GFX10-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16 +; GFX10-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32 +; GFX10-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] offset:48 +; GFX10-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:64 +; GFX10-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:80 +; GFX10-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96 +; GFX10-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v70, v[16:19], s[2:3] offset:192 -; GFX10-NEXT: global_store_dwordx4 v70, v[20:23], s[2:3] offset:208 -; GFX10-NEXT: global_store_dwordx4 v70, v[24:27], s[2:3] offset:224 -; GFX10-NEXT: global_store_dwordx4 v70, v[32:35], s[2:3] -; GFX10-NEXT: global_store_dwordx4 v70, v[36:39], s[2:3] offset:16 -; GFX10-NEXT: global_store_dwordx4 v70, v[40:43], s[2:3] offset:32 -; GFX10-NEXT: global_store_dwordx4 v70, v[44:47], s[2:3] offset:48 -; GFX10-NEXT: global_store_dwordx4 v70, v[48:51], s[2:3] offset:64 -; GFX10-NEXT: global_store_dwordx4 v70, v[52:55], s[2:3] offset:80 -; GFX10-NEXT: global_store_dwordx4 v70, v[56:59], s[2:3] offset:96 -; GFX10-NEXT: global_store_dwordx4 v70, v[60:63], s[2:3] offset:112 -; GFX10-NEXT: global_store_dwordx4 v70, v[28:31], s[2:3] offset:240 +; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id