Index: llvm/lib/Target/AMDGPU/AMDGPU.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.h +++ llvm/lib/Target/AMDGPU/AMDGPU.h @@ -55,6 +55,7 @@ FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsPass(); FunctionPass *createSIFormMemoryClausesPass(); +FunctionPass *createSIPostRABundlerPass(); FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &, const TargetMachine *); FunctionPass *createAMDGPUUseNativeCallsPass(); @@ -225,6 +226,9 @@ void initializeSIFormMemoryClausesPass(PassRegistry&); extern char &SIFormMemoryClausesID; +void initializeSIPostRABundlerPass(PassRegistry&); +extern char &SIPostRABundlerID; + void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); extern char &AMDGPUUnifyDivergentExitNodesID; Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -754,53 +754,6 @@ } namespace { -struct MemOpClusterMutation : ScheduleDAGMutation { - const SIInstrInfo *TII; - - MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} - - void apply(ScheduleDAGInstrs *DAG) override { - SUnit *SUa = nullptr; - // Search for two consequent memory operations and link them - // to prevent scheduler from moving them apart. - // In DAG pre-process SUnits are in the original order of - // the instructions before scheduling. 
- for (SUnit &SU : DAG->SUnits) { - MachineInstr &MI2 = *SU.getInstr(); - if (!MI2.mayLoad() && !MI2.mayStore()) { - SUa = nullptr; - continue; - } - if (!SUa) { - SUa = &SU; - continue; - } - - MachineInstr &MI1 = *SUa->getInstr(); - if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) || - (TII->isFLAT(MI1) && TII->isFLAT(MI2)) || - (TII->isSMRD(MI1) && TII->isSMRD(MI2)) || - (TII->isDS(MI1) && TII->isDS(MI2))) { - SU.addPredBarrier(SUa); - - for (const SDep &SI : SU.Preds) { - if (SI.getSUnit() != SUa) - SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial)); - } - - if (&SU != &DAG->ExitSU) { - for (const SDep &SI : SUa->Succs) { - if (SI.getSUnit() != &SU) - SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial)); - } - } - } - - SUa = &SU; - } - } -}; - struct FillMFMAShadowMutation : ScheduleDAGMutation { const SIInstrInfo *TII; @@ -927,7 +880,6 @@ void GCNSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo)); Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo)); } Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -234,6 +234,7 @@ initializeSIOptimizeExecMaskingPass(*PR); initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); + initializeSIPostRABundlerPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); @@ -973,6 +974,7 @@ } void GCNPassConfig::addPreSched2() { + addPass(&SIPostRABundlerID); } void GCNPassConfig::addPreEmitPass() { Index: llvm/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -115,6 +115,7 @@ SIOptimizeExecMasking.cpp SIOptimizeExecMaskingPreRA.cpp SIPeepholeSDWA.cpp + 
SIPostRABundler.cpp SIRegisterInfo.cpp SIRemoveShortExecBranches.cpp SIShrinkInstructions.cpp Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1549,22 +1549,6 @@ MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } - case TargetOpcode::BUNDLE: { - if (!MI.mayLoad() || MI.hasUnmodeledSideEffects()) - return false; - - // If it is a load it must be a memory clause - for (MachineBasicBlock::instr_iterator I = MI.getIterator(); - I->isBundledWithSucc(); ++I) { - I->unbundleFromSucc(); - for (MachineOperand &MO : I->operands()) - if (MO.isReg()) - MO.setIsInternalRead(false); - } - - MI.eraseFromParent(); - break; - } } return true; } Index: llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1289,6 +1289,21 @@ for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { + + if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) { + MachineBasicBlock::instr_iterator II(MI->getIterator()); + for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); + I != E && I->isBundledWithPred(); ++I) { + I->unbundleFromPred(); + for (MachineOperand &MO : I->operands()) + if (MO.isReg()) + MO.setIsInternalRead(false); + } + + MI->eraseFromParent(); + MI = II->getIterator(); + } + if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; Index: llvm/lib/Target/AMDGPU/SIPostRABundler.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -0,0 +1,138 @@ +//===-- SIPostRABundler.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass creates bundles of memory instructions to protect adjacent loads +/// and stores from being rescheduled apart from each other post-RA. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-post-ra-bundler" + +namespace { + +class SIPostRABundler : public MachineFunctionPass { +public: + static char ID; + +public: + SIPostRABundler() : MachineFunctionPass(ID) { + initializeSIPostRABundlerPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI post-RA bundler"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + const SIRegisterInfo *TRI; + + SmallSet<Register, 16> Defs; + + bool isDependentLoad(const MachineInstr &MI) const; + +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false) + +char SIPostRABundler::ID = 0; + +char &llvm::SIPostRABundlerID = SIPostRABundler::ID; + +FunctionPass *llvm::createSIPostRABundlerPass() { + return new SIPostRABundler(); +} + +bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const { + if (!MI.mayLoad()) + return false; + + for (const MachineOperand &Op : MI.explicit_uses()) { + if (!Op.isReg()) + continue; + Register Reg = Op.getReg(); + for (const Register Def : Defs) + if (TRI->regsOverlap(Reg, Def)) + return true; + } + + return false; +} + +bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + bool Changed = false; + const unsigned MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF | + SIInstrFlags::SMRD | SIInstrFlags::DS | + SIInstrFlags::FLAT | SIInstrFlags::MIMG; + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::instr_iterator Next; + MachineBasicBlock::instr_iterator B = MBB.instr_begin(); + MachineBasicBlock::instr_iterator E = MBB.instr_end(); + for (auto I = B; I != E; I = Next) { + Next = std::next(I); + + if (I->isBundled() || !I->mayLoadOrStore() || + B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() || + (B->getDesc().TSFlags & MemFlags) != + (I->getDesc().TSFlags & MemFlags) || + isDependentLoad(*I)) { + + if (B != I) { + if (std::next(B) != I) { + finalizeBundle(MBB, B, I); + Changed = true; + } + Next = I; + } + + B = Next; + Defs.clear(); + continue; + } + + if (I->getNumExplicitDefs() == 0) + continue; + + Defs.insert(I->defs().begin()->getReg()); + } + + if (B != E && std::next(B) != E) { + finalizeBundle(MBB, B, E); + Changed = true; + } + + Defs.clear(); + } + + return Changed; +} Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -275,8 +275,8 @@ ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 @@ -1832,8 +1832,8 @@ ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 Index: llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -263,13 +263,15 @@ ; GCN-NOT: s_add_u32 s32, s32, 0x800 -; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}} ; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8 ; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12 ; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 ; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 +; GCN-NOT: s_add_u32 s32, s32, 0x800 +; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}} + ; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 ; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 ; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 Index: llvm/test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -668,8 +668,6 @@ 
} ; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32: -; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}} - ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 ; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8 @@ -678,18 +676,17 @@ ; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8 ; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12 -; GCN-NOT: s_add_u32 [[SP]], - ; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8 ; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12 -; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}} -; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4 - - ; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8 ; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12 +; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}} + +; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}} +; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4 + ; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}} ; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4 Index: llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -400,7 +400,7 @@ ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} ; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s34{{$}} +; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} Index: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -114,10 +114,10 @@ ; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -138,11 +138,11 @@ ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_mov_b32 s8, s4 ; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 @@ -173,14 +173,14 @@ ; SI-NEXT: s_mov_b32 s18, s14 ; SI-NEXT: s_mov_b32 s19, s15 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 ; SI-NEXT: s_mov_b32 s16, s2 ; SI-NEXT: s_mov_b32 s17, s3 -; SI-NEXT: s_mov_b32 s6, s14 -; SI-NEXT: s_mov_b32 s7, s15 ; SI-NEXT: s_mov_b32 s2, s14 ; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s6, s14 +; SI-NEXT: s_mov_b32 s7, s15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 @@ -200,16 +200,16 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s8, s6 +; 
VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 Index: llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -274,14 +274,14 @@ ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_movk_i32 s12, 0xff ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_add_i32_e32 v7, vcc, 9, v1 @@ -293,7 +293,6 @@ ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v6 ; SI-NEXT: v_and_b32_e32 v7, s12, v7 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v6, v7 @@ -315,6 +314,7 @@ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: v_mov_b32_e32 v4, 9 +; VI-NEXT: s_movk_i32 s8, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -324,7 +324,6 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_movk_i32 s8, 0x900 ; VI-NEXT: v_mov_b32_e32 v6, s8 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v7, 
24, v5 @@ -391,11 +390,11 @@ ; SI-NEXT: v_or_b32_e32 v4, v3, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 @@ -416,6 +415,7 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 @@ -424,16 +424,15 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, 6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 @@ -446,11 +445,11 @@ ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 ; VI-NEXT: 
v_cvt_f32_ubyte0_e32 v4, v4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 Index: llvm/test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_write2st64.ll +++ llvm/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -63,7 +63,7 @@ ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}} -; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] +; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] ; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 ; GCN: s_endpgm define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { @@ -91,7 +91,7 @@ ; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}} -; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] +; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] ; GCN: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 ; GCN: s_endpgm define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { Index: llvm/test/CodeGen/AMDGPU/global-saddr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -4,8 +4,8 @@ ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: 
global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}} ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} Index: llvm/test/CodeGen/AMDGPU/half.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/half.ll +++ llvm/test/CodeGen/AMDGPU/half.ll @@ -307,7 +307,6 @@ ; GCN: flat_load_dwordx4 ; GCN: flat_load_dwordx4 -; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 @@ -316,6 +315,7 @@ ; GCN: flat_store_dwordx4 +; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 Index: llvm/test/CodeGen/AMDGPU/idot2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot2.ll +++ llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2552,15 +2552,15 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s6, s4, 16 -; GFX7-NEXT: s_and_b32 s4, s4, s8 ; GFX7-NEXT: s_lshr_b32 s7, s5, 16 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_and_b32 s4, s4, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2674,16 +2674,16 @@ ; GFX7-LABEL: notsdot2_sext8: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s10, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s8, s6 ; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_mov_b32 s6, s2 ; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX7-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 @@ -2704,11 +2704,11 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) @@ -2731,11 +2731,11 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off ; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -2759,11 +2759,11 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; 
GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off ; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -2788,13 +2788,13 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: global_load_ushort v2, v[2:3], off ; GFX10-DL-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/idot4s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot4s.ll +++ llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -183,16 +183,16 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_sext_i32_i8 s6, s4 -; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008 ; GFX7-NEXT: s_sext_i32_i8 s7, s5 ; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008 ; GFX7-NEXT: s_and_b32 s7, s7, s8 ; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010 +; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008 ; GFX7-NEXT: s_and_b32 s10, s10, s8 ; GFX7-NEXT: s_and_b32 s6, s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -357,19 +357,19 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 ; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_and_b32 s5, s4, s5 ; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 +; GFX7-NEXT: s_and_b32 s5, s4, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 ; GFX7-NEXT: s_lshr_b32 s6, s6, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 @@ -899,16 +899,16 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_ashr_i32 s6, s4, 24 -; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80010 ; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80010 ; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80008 ; GFX7-NEXT: s_ashr_i32 s9, s5, 24 ; GFX7-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80010 ; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80008 ; GFX7-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -1048,9 +1048,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80000 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 Index: 
llvm/test/CodeGen/AMDGPU/idot4u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot4u.ll +++ llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -184,19 +184,19 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 ; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_and_b32 s5, s4, s5 ; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 +; GFX7-NEXT: s_and_b32 s5, s4, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 ; GFX7-NEXT: s_lshr_b32 s6, s6, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 @@ -354,19 +354,19 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 ; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_and_b32 s5, s4, s5 ; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 +; GFX7-NEXT: s_and_b32 s5, s4, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 ; GFX7-NEXT: s_lshr_b32 s6, s6, 24 ; GFX7-NEXT: 
v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 @@ -518,15 +518,15 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_and_b32 s7, s4, s8 -; GFX7-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX7-NEXT: s_and_b32 s6, s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX7-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -615,9 +615,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 ; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 @@ -658,14 +658,14 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_and_b32 s6, s4, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: s_and_b32 s7, s5, s8 ; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 ; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 @@ -821,16 +821,16 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; 
GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_and_b32 s6, s4, s8 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 ; GFX7-NEXT: s_and_b32 s7, s5, s8 ; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 ; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 @@ -950,9 +950,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 @@ -1548,9 +1548,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 @@ -1940,9 +1940,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2002,17 +2002,17 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: 
s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s6, s4, 0x80008 -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 ; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 ; GFX7-NEXT: s_lshr_b32 s11, s5, 16 ; GFX7-NEXT: s_lshr_b32 s12, s5, 24 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: s_lshr_b32 s7, s4, 16 +; GFX7-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-NEXT: s_lshr_b32 s9, s4, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 ; GFX7-NEXT: s_mul_i32 s4, s4, s5 @@ -2160,9 +2160,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 Index: llvm/test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8s.ll +++ llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -260,15 +260,15 @@ ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 ; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 ; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004 ; GFX7-NEXT: s_and_b32 s9, s9, s0 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 ; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008 ; GFX7-NEXT: s_and_b32 s11, s11, s0 ; 
GFX7-NEXT: s_and_b32 s8, s8, s0 @@ -478,9 +478,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12 ; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12 @@ -594,15 +594,15 @@ ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_movk_i32 s0, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 ; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 ; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004 ; GFX7-NEXT: s_and_b32 s9, s9, s0 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 ; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008 ; GFX7-NEXT: s_and_b32 s11, s11, s0 ; GFX7-NEXT: s_and_b32 s8, s8, s0 @@ -821,9 +821,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12 ; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12 @@ -1844,9 +1844,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: 
s_and_b32 s5, s0, 15 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 @@ -1945,14 +1945,13 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s8, s[10:11], 0x0 -; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s2, 0x40004 ; GFX7-NEXT: s_bfe_i32 s16, s8, 0x40000 ; GFX7-NEXT: s_bfe_i32 s17, s8, 0x40004 ; GFX7-NEXT: s_bfe_i32 s18, s8, 0x40008 @@ -1961,8 +1960,9 @@ ; GFX7-NEXT: s_bfe_i32 s21, s8, 0x40014 ; GFX7-NEXT: s_bfe_i32 s22, s8, 0x40018 ; GFX7-NEXT: s_ashr_i32 s8, s8, 28 -; GFX7-NEXT: v_mov_b32_e32 v7, s17 ; GFX7-NEXT: v_mov_b32_e32 v8, s16 +; GFX7-NEXT: s_bfe_i32 s10, s2, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v7, s17 ; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: s_bfe_i32 s12, s2, 0x4000c @@ -2292,9 +2292,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 4 ; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 4 Index: llvm/test/CodeGen/AMDGPU/idot8u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8u.ll +++ llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -14,11 +14,10 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: 
s_load_dword s21, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s1, s0, 28 ; GFX7-NEXT: s_lshr_b32 s11, s10, 28 ; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 @@ -27,6 +26,7 @@ ; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 ; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 ; GFX7-NEXT: s_and_b32 s10, s10, 15 +; GFX7-NEXT: s_lshr_b32 s1, s0, 28 ; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 @@ -59,11 +59,10 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 ; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 ; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 @@ -72,6 +71,7 @@ ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 +; GFX8-NEXT: s_lshr_b32 s4, s2, 28 ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 ; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 @@ -106,11 +106,10 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 @@ -119,6 +118,7 @@ ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 ; GFX9-NEXT: 
s_bfe_u32 s5, s2, 0x40018 ; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 @@ -259,12 +259,11 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -273,6 +272,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -451,9 +451,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 @@ -561,12 +561,11 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -575,6 +574,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; 
GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -753,9 +753,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 @@ -863,12 +863,11 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -877,6 +876,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -1065,9 +1065,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 @@ -1162,12 +1162,11 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -1176,6 +1175,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -1364,9 +1364,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 @@ -1459,11 +1459,10 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s1, s0, 28 ; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 ; GFX7-NEXT: s_lshr_b32 s11, s10, 28 ; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 @@ -1472,6 +1471,7 @@ ; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c ; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 ; GFX7-NEXT: s_and_b32 s10, s10, 15 +; GFX7-NEXT: s_lshr_b32 s1, s0, 28 ; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 @@ -1506,11 +1506,10 @@ ; GFX8-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 ; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 ; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 @@ -1519,6 +1518,7 @@ ; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX8-NEXT: s_and_b32 s6, s6, 15 +; GFX8-NEXT: s_lshr_b32 s4, s2, 28 ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 ; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 @@ -1555,11 +1555,10 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 ; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 @@ -1568,6 +1567,7 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX9-NEXT: s_and_b32 s6, s6, 15 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 ; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 ; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 @@ -1604,11 +1604,10 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 ; GFX9-DL-NEXT: s_bfe_u32 
s18, s6, 0x40004 ; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40018 @@ -1617,6 +1616,7 @@ ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX9-DL-NEXT: s_and_b32 s6, s6, 15 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 ; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 ; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 @@ -1767,11 +1767,10 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s1, s0, 28 ; GFX7-NEXT: s_lshr_b32 s11, s10, 28 ; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 @@ -1780,6 +1779,7 @@ ; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 ; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 ; GFX7-NEXT: s_and_b32 s10, s10, 15 +; GFX7-NEXT: s_lshr_b32 s1, s0, 28 ; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 @@ -1812,11 +1812,10 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 ; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 ; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 @@ -1825,6 +1824,7 @@ ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 +; GFX8-NEXT: s_lshr_b32 s4, s2, 28 ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 ; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 @@ -1859,11 
+1859,10 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 @@ -1872,6 +1871,7 @@ ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 ; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 ; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 @@ -1977,15 +1977,13 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40004 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x4000c ; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40004 ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: v_mov_b32_e32 v4, s18 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 @@ -1993,6 +1991,8 @@ ; GFX7-NEXT: s_and_b32 s19, s1, 15 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: v_mul_u32_u24_e32 v2, s13, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 @@ -2193,9 +2193,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 
0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 @@ -2278,21 +2278,21 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX7-NEXT: s_bfe_u32 s14, s1, 0x4000c ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s18, s1, 28 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 ; GFX7-NEXT: v_mov_b32_e32 v8, s14 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40008 ; GFX7-NEXT: s_and_b32 s17, s1, 15 ; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 ; GFX7-NEXT: s_lshr_b32 s11, s0, 28 ; GFX7-NEXT: v_mov_b32_e32 v4, s18 ; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 @@ -2555,9 +2555,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 @@ -2650,12 +2650,11 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; 
GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -2664,6 +2663,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -2852,9 +2852,9 @@ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -16,9 +16,9 @@ ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, 0x40a00000 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 @@ -31,9 +31,9 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, 0x40a00000 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -156,9 +156,9 @@ ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_movk_i32 s4, 0x3e7 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; 
SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 @@ -171,9 +171,9 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_movk_i32 s4, 0x3e7 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -505,9 +505,9 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 @@ -538,9 +538,9 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 @@ -690,8 +690,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 ; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 @@ -1628,6 +1628,10 @@ ; SI-NEXT: s_and_b32 s4, s4, 7 ; SI-NEXT: s_lshl_b32 s4, s4, 3 ; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: 
v_mov_b32_e32 v3, s15 ; SI-NEXT: v_mov_b32_e32 v4, s16 @@ -1638,10 +1642,6 @@ ; SI-NEXT: v_mov_b32_e32 v9, s21 ; SI-NEXT: v_mov_b32_e32 v10, s22 ; SI-NEXT: v_mov_b32_e32 v11, s23 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v13, s25 -; SI-NEXT: v_mov_b32_e32 v14, s26 -; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80 @@ -1674,6 +1674,10 @@ ; VI-NEXT: s_and_b32 s4, s4, 7 ; VI-NEXT: s_lshl_b32 s4, s4, 3 ; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 ; VI-NEXT: v_mov_b32_e32 v4, s16 @@ -1684,10 +1688,6 @@ ; VI-NEXT: v_mov_b32_e32 v9, s21 ; VI-NEXT: v_mov_b32_e32 v10, s22 ; VI-NEXT: v_mov_b32_e32 v11, s23 -; VI-NEXT: v_mov_b32_e32 v12, s24 -; VI-NEXT: v_mov_b32_e32 v13, s25 -; VI-NEXT: v_mov_b32_e32 v14, s26 -; VI-NEXT: v_mov_b32_e32 v15, s27 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80 Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -579,8 +579,8 @@ ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -599,8 +599,8 @@ ; VI-LABEL: 
v_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -619,8 +619,8 @@ ; CI-LABEL: v_insertelement_v2i16_0_reghi: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1165,8 +1165,8 @@ ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 @@ -1186,8 +1186,8 @@ ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1207,8 +1207,8 @@ ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1330,8 +1330,8 @@ ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1349,8 +1349,8 @@ ; VI-LABEL: v_insertelement_v4f16_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1369,8 +1369,8 @@ ; CI-LABEL: v_insertelement_v4f16_0: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1401,8 +1401,8 @@ ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 @@ -1420,8 +1420,8 @@ ; VI-LABEL: v_insertelement_v4f16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1440,8 +1440,8 @@ ; CI-LABEL: v_insertelement_v4f16_1: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1472,8 +1472,8 @@ ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1491,8 +1491,8 @@ ; VI-LABEL: v_insertelement_v4f16_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1511,8 +1511,8 @@ ; CI-LABEL: v_insertelement_v4f16_2: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1543,8 +1543,8 @@ ; GFX9-LABEL: v_insertelement_v4f16_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 @@ -1562,8 +1562,8 @@ ; VI-LABEL: v_insertelement_v4f16_3: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1582,8 +1582,8 @@ ; CI-LABEL: v_insertelement_v4f16_3: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, 
v2 @@ -1614,8 +1614,8 @@ ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1633,8 +1633,8 @@ ; VI-LABEL: v_insertelement_v4i16_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1653,8 +1653,8 @@ ; CI-LABEL: v_insertelement_v4i16_2: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1686,9 +1686,9 @@ ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1712,10 +1712,10 @@ ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1723,16 +1723,15 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; 
VI-NEXT: s_mov_b32 s5, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, s6, s4 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshl_b32 s0, s1, 16 ; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1741,27 +1740,26 @@ ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s6, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_load_dword s6, s[4:5], 0x4 ; CI-NEXT: s_mov_b32 s4, 0xffff ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_mov_b32 s5, 0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b32 s2, s6, 16 ; CI-NEXT: s_and_b32 s3, s6, s4 ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_or_b32 s1, s3, s2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; CI-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_bfi_b32 v1, v5, s1, v1 ; CI-NEXT: v_bfi_b32 v0, v4, s1, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1783,8 +1781,8 @@ ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; 
GFX9-NEXT: s_mov_b32 s7, 0 ; GFX9-NEXT: s_mov_b32 s6, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1809,8 +1807,8 @@ ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_mov_b32 s6, 0xffff ; VI-NEXT: s_mov_b32 s7, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1837,8 +1835,8 @@ ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_mov_b32 s6, 0xffff ; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) Index: llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -15,13 +15,13 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; SI-NEXT: s_mov_b32 s8, s4 @@ -49,9 +49,9 @@ ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -74,9 +74,9 @@ ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 ; 
GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -460,9 +460,9 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s6, 16 ; SI-NEXT: s_lshr_b32 s4, s8, 16 @@ -478,9 +478,9 @@ ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_max_f32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 Index: llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -15,13 +15,13 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; SI-NEXT: s_mov_b32 s8, s4 @@ -49,9 +49,9 @@ ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; 
VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -74,9 +74,9 @@ ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -513,9 +513,9 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s6, 16 ; SI-NEXT: s_lshr_b32 s4, s8, 16 @@ -531,9 +531,9 @@ ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_min_f32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 Index: llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -427,9 +427,9 @@ ; CI-NEXT: v_mov_b32_e32 v13, s13 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; CI-NEXT: v_bfi_b32 v12, s2, v12, v13 -; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc ; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] ; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1] ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 Index: llvm/test/CodeGen/AMDGPU/load-lo16.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -545,8 +545,8 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 -; GFX803-NEXT: v_mov_b32_e32 v2, 0 ; GFX803-NEXT: ds_read_u16 v0, v0 +; GFX803-NEXT: v_mov_b32_e32 v2, 0 ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: ds_write_b16 v2, v0 @@ -594,10 +594,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: ds_read_u16 v0, v0 +; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: ds_write_b16 v3, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v2 @@ -618,8 +618,8 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: ds_write_b16 v2, v0 ; GFX900-NEXT: ds_write_b16 v3, v5 @@ -632,8 +632,8 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX906-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v2, v0 ; GFX906-NEXT: ds_write_b16 v3, v5 Index: llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -42,10 +42,10 @@ ; 
CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 ; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 ; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]] -; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] ; GCN: s_barrier +; SI: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 Index: llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -145,8 +145,8 @@ ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 @@ -163,8 +163,8 @@ ; VI-LABEL: lshr_v_s_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 @@ -220,8 +220,8 @@ ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 @@ -238,8 +238,8 @@ ; VI-LABEL: lshr_s_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -885,7 +885,8 @@ ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GFX10CU-NOT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX89: s_waitcnt lgkmcnt(0){{$}} +; GFX89: s_waitcnt vmcnt(0){{$}} ; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GFX10WGP-NEXT: buffer_gl0_inv @@ -912,7 +913,8 @@ ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX89: s_waitcnt lgkmcnt(0){{$}} +; GFX89: s_waitcnt vmcnt(0){{$}} ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GFX10WGP-NEXT: buffer_gl0_inv Index: llvm/test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,10 +5,12 @@ ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v17, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_dwordx4 v[0:3], v[16:17], s[2:3] ; GCN-NEXT: global_load_dwordx4 v[4:7], v[16:17], s[2:3] offset:16 ; GCN-NEXT: global_load_dwordx4 v[8:11], 
v[16:17], s[2:3] offset:32 @@ -64,6 +66,7 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s8 ; GCN-NEXT: v_mov_b32_e32 v13, s19 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -71,15 +74,14 @@ ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: v_mov_b32_e32 v8, s8 -; GCN-NEXT: v_mov_b32_e32 v9, s9 -; GCN-NEXT: v_mov_b32_e32 v10, s10 -; GCN-NEXT: v_mov_b32_e32 v11, s11 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off ; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 ; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v9, s9 +; GCN-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NEXT: v_mov_b32_e32 v11, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 @@ -113,7 +115,6 @@ ; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GCN-NEXT: v_add_u32_e32 v0, v0, v2 -; GCN-NEXT: v_add_u32_e32 v1, v1, v2 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], s33 offen @@ -131,6 +132,7 @@ ; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], s33 offen offset:48 ; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], s33 offen offset:52 ; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], s33 offen offset:56 +; GCN-NEXT: v_add_u32_e32 v1, v1, v2 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen offset:60 @@ -198,10 +200,12 @@ ; GCN-LABEL: vector_clause_indirect: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_dwordx2 v[8:9], v[0:1], 
s[2:3] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/merge-store-crash.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/merge-store-crash.ll +++ llvm/test/CodeGen/AMDGPU/merge-store-crash.ll @@ -7,9 +7,9 @@ @tess_lds = external addrspace(3) global [8192 x i32] ; CHECK-LABEL: {{^}}main: -; CHECK: ds_write_b32 -; CHECK: ds_write_b32 -; CHECK: v_mov_b32_e32 v1, v0 +; CHECK-DAG: ds_write_b32 +; CHECK-DAG: ds_write_b32 +; CHECK-DAG: v_mov_b32_e32 v1, v0 ; CHECK: tbuffer_store_format_xyzw v[0:3], define amdgpu_vs void @main(i32 inreg %arg) { main_body: Index: llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -0,0 +1,108 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-post-ra-bundler %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: bundle_memops +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: bundle_memops + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: S_NOP 0 + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit undef $vgpr3_vgpr4, implicit $exec { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: S_NOP 0 + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr5, implicit undef $vgpr0_vgpr1, implicit $exec, implicit undef $vgpr3_vgpr4 { + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, implicit $exec + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + ; GCN: $vgpr5 = 
GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: BUNDLE implicit undef $vgpr3_vgpr4, implicit $vgpr1, implicit $exec, implicit $vgpr0 { + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: S_NOP 0 + ; GCN: BUNDLE implicit undef $vgpr3_vgpr4, implicit $vgpr1, implicit $exec, implicit $vgpr0 { + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: S_NOP 0 + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: S_NOP 0 + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $exec, implicit $vgpr1 { + ; GCN: $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec + ; GCN: $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec + ; GCN: } + ; GCN: BUNDLE implicit $vgpr0, implicit $vgpr2, implicit killed $m0, implicit $exec, implicit $vgpr3 { + ; GCN: DS_WRITE_B32_gfx9 $vgpr0, $vgpr2, 0, 0, implicit killed $m0, implicit $exec + ; GCN: DS_WRITE_B32_gfx9 $vgpr0, $vgpr3, 4, 0, implicit killed $m0, implicit $exec + ; GCN: } + ; GCN: S_NOP 0 + ; GCN: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit undef $sgpr0_sgpr1, implicit undef $sgpr10 { + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0 + ; GCN: $sgpr3 = S_LOAD_DWORD_SGPR undef $sgpr0_sgpr1, undef $sgpr10, 0, 0 + ; GCN: } + ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr2, implicit $exec, implicit $vgpr1 { + ; GCN: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 
0, 0, 0, 0, implicit $exec + ; GCN: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: BUNDLE implicit $vgpr0, implicit $vgpr2_vgpr3, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec { + ; GCN: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: S_NOP 0 + ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $exec, implicit $vgpr1 { + ; GCN: $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec + ; GCN: $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec + ; GCN: } + $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + S_NOP 0 + $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit 
$exec + $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + S_NOP 0 + $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + S_NOP 0 + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec + S_NOP 0 + $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + S_NOP 0 + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec + $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec + DS_WRITE_B32_gfx9 $vgpr0, $vgpr2, 0, 0, implicit killed $m0, implicit $exec + DS_WRITE_B32_gfx9 $vgpr0, $vgpr3, 4, 0, implicit killed $m0, implicit $exec + S_NOP 0 + $sgpr2 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0 + $sgpr3 = S_LOAD_DWORD_SGPR undef $sgpr0_sgpr1, undef $sgpr10, 0, 0 + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 
= IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec + IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec + S_NOP 0 + $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec + $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec +... Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -86,6 +86,7 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -94,7 +95,6 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; @@ -454,14 +454,14 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: 
flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} Index: llvm/test/CodeGen/AMDGPU/saddo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/saddo.ll +++ llvm/test/CodeGen/AMDGPU/saddo.ll @@ -191,9 +191,9 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_load_dword v6, v[6:7] ; VI-NEXT: flat_load_dword v4, v[4:5] ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -215,9 +215,9 @@ ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: global_load_dword v6, v[6:7], off ; GFX9-NEXT: global_load_dword v4, 
v[4:5], off ; GFX9-NEXT: v_mov_b32_e32 v2, s0 Index: llvm/test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -170,9 +170,8 @@ ; CI. ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8: -; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} -; GCN-NOHSA-NOT: v_add -; CI-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} +; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} +; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} ; CI-NOHSA-NOT: v_add ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; CI-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} Index: llvm/test/CodeGen/AMDGPU/scratch-simple.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -101,10 +101,10 @@ ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen ; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GCN: s_mov_b32 s2, s5 +; GCN-DAG: s_mov_b32 s2, s5 define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -120,10 +120,10 @@ ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, 
{{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GCN: s_mov_b32 s2, s5 +; GCN-DAG: s_mov_b32 s2, s5 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx Index: llvm/test/CodeGen/AMDGPU/select.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select.f16.ll +++ llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -11,17 +11,17 @@ ; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_mov_b32 s22, s14 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s10 -; SI-NEXT: s_mov_b32 s17, s11 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 ; SI-NEXT: s_mov_b32 s20, s6 ; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: s_mov_b32 s23, s15 +; SI-NEXT: s_mov_b32 s16, s10 +; SI-NEXT: s_mov_b32 s17, s11 ; SI-NEXT: s_mov_b32 s2, s14 ; SI-NEXT: s_mov_b32 s3, s15 ; SI-NEXT: s_mov_b32 s18, s14 ; SI-NEXT: s_mov_b32 s19, s15 +; SI-NEXT: s_mov_b32 s10, s14 +; SI-NEXT: s_mov_b32 s11, s15 ; SI-NEXT: buffer_load_ushort v0, off, s[20:23], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; SI-NEXT: buffer_load_ushort v2, off, s[16:19], 0 @@ -52,17 +52,17 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s16, s10 -; VI-NEXT: s_mov_b32 s17, s11 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s16, s10 +; 
VI-NEXT: s_mov_b32 s17, s11 ; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[16:19], 0 @@ -248,14 +248,14 @@ ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s16, s2 ; SI-NEXT: s_mov_b32 s17, s3 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 @@ -283,14 +283,14 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s0, s2 ; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 @@ -324,14 +324,14 @@ ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s16, s2 ; SI-NEXT: s_mov_b32 s17, s3 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, 
s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 @@ -359,14 +359,14 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s0, s2 ; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 @@ -400,18 +400,18 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s22, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s10 -; SI-NEXT: s_mov_b32 s17, s11 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_mov_b32 s20, s6 ; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s16, s10 +; SI-NEXT: s_mov_b32 s17, s11 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0 @@ -419,8 +419,6 @@ ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -428,7 +426,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -454,18 +454,18 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s16, s10 -; VI-NEXT: s_mov_b32 s17, s11 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s16, s10 +; VI-NEXT: s_mov_b32 s17, s11 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v3, off, s[16:19], 0 @@ -506,15 +506,15 @@ ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s2 -; SI-NEXT: s_mov_b32 s17, s3 ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s16, s2 +; SI-NEXT: s_mov_b32 s17, s3 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 @@ -556,12 +556,12 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s0, s2 ; VI-NEXT: s_mov_b32 s1, s3 ; VI-NEXT: s_mov_b32 s2, s10 ; 
VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 @@ -601,15 +601,15 @@ ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s2 -; SI-NEXT: s_mov_b32 s17, s3 ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s16, s2 +; SI-NEXT: s_mov_b32 s17, s3 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 @@ -651,12 +651,12 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s0, s2 ; VI-NEXT: s_mov_b32 s1, s3 ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 @@ -705,25 +705,24 @@ ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s16, s2 ; SI-NEXT: s_mov_b32 s17, s3 -; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0 ; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc @@ -755,16 +754,15 @@ ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v2, 0x3800 ; VI-NEXT: v_mov_b32_e32 v3, 0x3900 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v4 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5 @@ -802,25 +800,25 @@ ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s16, s2 ; SI-NEXT: s_mov_b32 s17, s3 -; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0 ; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v5, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc @@ -852,16 +850,15 @@ ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v2, 0x3800 ; VI-NEXT: v_mov_b32_e32 v3, 0x3900 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v4 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 Index: llvm/test/CodeGen/AMDGPU/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/shl.ll +++ llvm/test/CodeGen/AMDGPU/shl.ll @@ -163,9 +163,9 @@ ; GCN-LABEL: shl_i16_v_s: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 @@ -218,9 +218,9 @@ ; GCN-LABEL: shl_i16_v_compute_s: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 Index: llvm/test/CodeGen/AMDGPU/shl.v2i16.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -146,8 +146,8 @@ ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 @@ -164,8 +164,8 @@ ; VI-LABEL: shl_v_s_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 @@ -221,8 +221,8 @@ ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 @@ -239,8 +239,8 @@ ; VI-LABEL: shl_s_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 Index: llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -125,7 +125,7 @@ ; GFX9-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x4 ; GFX9-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xc -; GCN: ds_write_b32 +; GCN-DAG: 
ds_write_b32 ; CI: buffer_store_dword ; GFX9: global_store_dword define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { @@ -255,21 +255,19 @@ ; CI: v_mov_b32 ; CI: v_mov_b32 -; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} - ; CI: v_add_i32 ; CI: v_add_i32 +; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} - ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 ; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 Index: llvm/test/CodeGen/AMDGPU/sign_extend.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -556,9 +556,9 @@ ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; 
VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 ; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 ; VI-NEXT: v_bfe_i32 v1, v1, 0, 16 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 Index: llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -190,10 +190,10 @@ } ; GCN-LABEL: {{^}}s_min_max_v4i16: -; GFX9: v_pk_max_i16 -; GFX9: v_pk_min_i16 -; GFX9: v_pk_max_i16 -; GFX9: v_pk_min_i16 +; GFX9-DAG: v_pk_max_i16 +; GFX9-DAG: v_pk_min_i16 +; GFX9-DAG: v_pk_max_i16 +; GFX9-DAG: v_pk_min_i16 define amdgpu_kernel void @s_min_max_v4i16(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 { %cond0 = icmp sgt <4 x i16> %val0, %val1 %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1 Index: llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -215,8 +215,8 @@ ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; CIVI-NEXT: ds_write_b16 v0, v1 ; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) Index: llvm/test/CodeGen/AMDGPU/v_mac_f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -352,10 +352,10 @@ } ; GCN-LABEL: {{^}}mac_v2f16_same_add: -; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]}}, 
v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} Index: llvm/test/CodeGen/AMDGPU/v_madak_f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -41,9 +41,9 @@ ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -75,11 +75,11 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s10 ; SI-NEXT: s_mov_b32 s17, s11 +; SI-NEXT: s_mov_b32 s10, s14 +; SI-NEXT: s_mov_b32 s11, s15 ; SI-NEXT: s_mov_b32 s3, s15 ; SI-NEXT: s_mov_b32 s18, s14 ; SI-NEXT: s_mov_b32 s19, s15 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 @@ -112,11 +112,11 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s10 ; VI-NEXT: s_mov_b32 s17, s11 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_ushort v1, off, 
s[16:19], 0 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0