diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h --- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -1130,6 +1130,9 @@ if (!TM.requiresStructuredCFG()) addPass(TailDuplicatePass()); + // Cleanup of redundant (identical) address/immediate loads. + addPass(MachineLateInstrsCleanupPass()); + // Copy propagation. addPass(MachineCopyPropagationPass()); } diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def --- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def +++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -151,6 +151,7 @@ DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass, ()) DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass, ()) DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass, ()) diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -334,6 +334,10 @@ MachineFunctionPass *createMachineCopyPropagationPass(bool UseCopyInstr); + /// MachineLateInstrsCleanup - This pass removes redundant identical + /// instructions after register allocation and rematerialization. + extern char &MachineLateInstrsCleanupID; + /// PeepholeOptimizer - This pass performs peephole optimizations - /// like extension and comparison eliminations. 
extern char &PeepholeOptimizerID; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -277,6 +277,7 @@ void initializeMachineDominatorTreePass(PassRegistry&); void initializeMachineFunctionPrinterPassPass(PassRegistry&); void initializeMachineFunctionSplitterPass(PassRegistry &); +void initializeMachineLateInstrsCleanupPass(PassRegistry&); void initializeMachineLICMPass(PassRegistry&); void initializeMachineLoopInfoPass(PassRegistry&); void initializeMachineModuleInfoWrapperPassPass(PassRegistry &); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -119,6 +119,7 @@ MachineFunctionSplitter.cpp MachineInstrBundle.cpp MachineInstr.cpp + MachineLateInstrsCleanup.cpp MachineLICM.cpp MachineLoopInfo.cpp MachineLoopUtils.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -78,6 +78,7 @@ initializeMachineCycleInfoWrapperPassPass(Registry); initializeMachineDominatorTreePass(Registry); initializeMachineFunctionPrinterPassPass(Registry); + initializeMachineLateInstrsCleanupPass(Registry); initializeMachineLICMPass(Registry); initializeMachineLoopInfoPass(Registry); initializeMachineModuleInfoWrapperPassPass(Registry); diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp @@ -0,0 +1,239 @@ +//==--- MachineLateInstrsCleanup.cpp - Late Instructions Cleanup Pass -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This simple pass removes any identical and redundant immediate or address +// loads to the same register. The immediate loads removed can originally be +// the result of rematerialization, while the addresses are redundant frame +// addressing anchor points created during Frame Indices elimination. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-latecleanup" + +STATISTIC(NumRemoved, "Number of redundant instructions removed."); + +namespace { + +class MachineLateInstrsCleanup : public MachineFunctionPass { + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + + // Data structures to map regs to their definitions per MBB. + using Reg2DefMap = std::map; + std::vector RegDefs; + + // Walk through the instructions in MBB and remove any redundant + // instructions. 
+ bool processBlock(MachineBasicBlock *MBB); + +public: + static char ID; // Pass identification, replacement for typeid + + MachineLateInstrsCleanup() : MachineFunctionPass(ID) { + initializeMachineLateInstrsCleanupPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } +}; + +} // end anonymous namespace + +char MachineLateInstrsCleanup::ID = 0; + +char &llvm::MachineLateInstrsCleanupID = MachineLateInstrsCleanup::ID; + +INITIALIZE_PASS(MachineLateInstrsCleanup, DEBUG_TYPE, + "Machine Late Instructions Cleanup Pass", false, false) + +bool MachineLateInstrsCleanup::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + bool Changed = false; + + TRI = MF.getSubtarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + + RegDefs.clear(); + RegDefs.resize(MF.getNumBlockIDs()); + + // Visit all MBBs in an order that maximises the reuse from predecessors. + ReversePostOrderTraversal RPOT(&MF); + for (MachineBasicBlock *MBB : RPOT) + Changed |= processBlock(MBB); + + return Changed; +} + +// Clear any previous kill flag on Reg found before I in MBB. Walk backwards +// in MBB and if needed continue in predecessors until a use/def of Reg is +// encountered. This seems to be faster in practice than tracking kill flags +// in a map. 
+static void clearKillsForDef(Register Reg, MachineBasicBlock *MBB,
+                             MachineBasicBlock::iterator I,
+                             BitVector &VisitedPreds,
+                             const TargetRegisterInfo *TRI) {
+  VisitedPreds.set(MBB->getNumber());
+  while (I != MBB->begin()) {
+    --I;
+    bool Found = false;
+    // Debug operands are ignored so that debug info cannot end the search.
+    for (auto &MO : I->operands())
+      if (MO.isReg() && !MO.isDebug() && TRI->regsOverlap(MO.getReg(), Reg)) {
+        if (MO.isDef())
+          return;
+        if (MO.readsReg()) {
+          MO.setIsKill(false);
+          Found = true; // Keep going for an implicit kill of the super-reg.
+        }
+      }
+    if (Found)
+      return;
+  }
+  // If an earlier def is not in MBB, continue in predecessors.
+  if (!MBB->isLiveIn(Reg))
+    MBB->addLiveIn(Reg);
+  assert(!MBB->pred_empty() && "Predecessor def not found!");
+  for (MachineBasicBlock *Pred : MBB->predecessors())
+    if (!VisitedPreds.test(Pred->getNumber()))
+      clearKillsForDef(Reg, Pred, Pred->end(), VisitedPreds, TRI);
+}
+
+// Erase the redundant MI, first clearing stale kill flags on its def reg.
+static void removeRedundantDef(MachineInstr *MI,
+                               const TargetRegisterInfo *TRI) {
+  Register Reg = MI->getOperand(0).getReg();
+  BitVector VisitedPreds(MI->getMF()->getNumBlockIDs());
+  clearKillsForDef(Reg, MI->getParent(), MI->getIterator(), VisitedPreds, TRI);
+  MI->eraseFromParent();
+  ++NumRemoved;
+}
+
+// Return true if MI is a candidate for reuse/removal, and if so also the
+// register it defines in DefedReg. A candidate does not touch memory, has
+// only one register definition and may only use FrameReg. Typically this is
+// an immediate load or a load-address instruction.
+static bool isCandidate(const MachineInstr *MI, Register &DefedReg, + Register FrameReg) { + DefedReg = MCRegister::NoRegister; + bool SawStore = true; + if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() || + MI->isInlineAsm()) + return false; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + if (MO.isDef()) { + if (i == 0 && !MO.isImplicit() && !MO.isDead()) + DefedReg = MO.getReg(); + else + return false; + } else if (MO.getReg() && MO.getReg() != FrameReg) + return false; + } else if (!(MO.isImm() || MO.isCImm() || MO.isFPImm() || MO.isCPI() || + MO.isGlobal() || MO.isSymbol())) + return false; + } + return DefedReg.isValid(); +} + +bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) { + bool Changed = false; + + Reg2DefMap &MBBDefs = RegDefs[MBB->getNumber()]; + + // Find reusable definitions in the predecessor(s). + if (!MBB->pred_empty()) { + MachineBasicBlock *FirstPred = *MBB->pred_begin(); + for (auto [Reg, DefMI] : RegDefs[FirstPred->getNumber()]) + if (llvm::all_of(drop_begin(MBB->predecessors()), + [&](const MachineBasicBlock *Pred) { + auto PredDefI = RegDefs[Pred->getNumber()].find(Reg); + return PredDefI != RegDefs[Pred->getNumber()].end() && + DefMI->isIdenticalTo(*PredDefI->second); + })) { + MBBDefs[Reg] = DefMI; + LLVM_DEBUG(dbgs() << "Reusable instruction from pred(s): in " + << printMBBReference(*MBB) << ": " << *DefMI;); + } + } + + // Process MBB. + MachineFunction *MF = MBB->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + Register FrameReg = TRI->getFrameRegister(*MF); + for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { + // If FrameReg is modified, no previous load-address instructions are valid. 
+ if (MI.modifiesRegister(FrameReg, TRI)) { + MBBDefs.clear(); + continue; + } + + Register DefedReg; + bool IsCandidate = isCandidate(&MI, DefedReg, FrameReg); + + // Check for an earlier identical and reusable instruction. + if (IsCandidate) { + auto DefI = MBBDefs.find(DefedReg); + if (DefI != MBBDefs.end() && MI.isIdenticalTo(*DefI->second)) { + LLVM_DEBUG(dbgs() << "Removing redundant instruction in " + << printMBBReference(*MBB) << ": " << MI;); + removeRedundantDef(&MI, TRI); + Changed = true; + continue; + } + } + + // Clear any entries in map that MI clobbers. + for (auto DefI = MBBDefs.begin(); DefI != MBBDefs.end();) { + Register Reg = DefI->first; + if (MI.modifiesRegister(Reg, TRI)) + DefI = MBBDefs.erase(DefI); + else + ++DefI; + } + + // Record this MI for potential later reuse. + if (IsCandidate) { + LLVM_DEBUG(dbgs() << "Found interesting instruction in " + << printMBBReference(*MBB) << ": " << MI;); + MBBDefs[DefedReg] = &MI; + } + } + + return Changed; +} diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1522,6 +1522,9 @@ /// Add passes that optimize machine instructions after register allocation. void TargetPassConfig::addMachineLateOptimization() { + // Cleanup of redundant immediate/address loads. + addPass(&MachineLateInstrsCleanupID); + // Branch folding must be run after regalloc and prolog/epilog insertion. addPass(&BranchFolderPassID); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -290,6 +290,7 @@ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). 
disablePass(&PrologEpilogCodeInserterID); + disablePass(&MachineLateInstrsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&TailDuplicateID); disablePass(&StackMapLivenessID); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -500,6 +500,7 @@ // them. // These functions all require the NoVRegs property. + disablePass(&MachineLateInstrsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&PostRAMachineSinkingID); disablePass(&PostRASchedulerID); diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -184,6 +184,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; CHECK-NEXT: Machine Late Instructions Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication diff --git a/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll b/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll --- a/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll @@ -29,14 +29,8 @@ ; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr x9, [x9] ; CHECK-NEXT: str x8, [sp] -; CHECK-NEXT: Lloh6: -; CHECK-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE ; CHECK-NEXT: stur x9, [x29, #-8] -; CHECK-NEXT: Lloh7: -; CHECK-NEXT: ldr x8, [x8, ___stack_chk_guard@GOTPAGEOFF] ; CHECK-NEXT: ldur x9, [x29, #-8] -; CHECK-NEXT: Lloh8: -; CHECK-NEXT: ldr x8, [x8] ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: b.ne LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %entry @@ -46,7 +40,6 @@ ; CHECK-NEXT: ret ; CHECK-NEXT: LBB0_2: ; %entry ; CHECK-NEXT: bl 
___stack_chk_fail -; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh6, Lloh7, Lloh8 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh1, Lloh3, Lloh5 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh0, Lloh2, Lloh4 entry: diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -59,26 +59,23 @@ ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: fmov s0, #1.00000000 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: mov w1, #1 ; CHECK-NEXT: mov w2, #2 -; CHECK-NEXT: st1d { z16.d }, p0, [x9] -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w3, #3 ; CHECK-NEXT: mov w4, #4 ; CHECK-NEXT: mov w5, #5 ; CHECK-NEXT: mov w6, #6 -; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl] -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w7, #7 -; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl] ; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z16.d }, p0, [x9] +; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl] +; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl] ; CHECK-NEXT: st1d { z19.d }, p0, [x9, #3, mul vl] ; CHECK-NEXT: str x8, [sp] ; CHECK-NEXT: bl callee2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -157,8 +157,6 @@ ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8 -; 
FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16 ; FLATSCR-NEXT: s_mov_b32 s11, 0 ; FLATSCR-NEXT: s_mov_b32 s10, 0 ; FLATSCR-NEXT: s_mov_b32 s9, 0 @@ -171,9 +169,8 @@ ; FLATSCR-NEXT: s_mov_b32 s4, 0 ; FLATSCR-NEXT: s_mov_b32 s3, 0 ; FLATSCR-NEXT: s_mov_b32 s2, 0 -; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: s_mov_b32 s40, 0 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s11 offset:24 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s10 offset:32 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s9 offset:40 @@ -188,6 +185,7 @@ ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2 offset:112 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:120 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:128 +; FLATSCR-NEXT: s_mov_b32 s40, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s40 offset:8 ; FLATSCR-NEXT: s_mov_b32 s39, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s39 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -1354,7 +1354,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7-NEXT: s_cbranch_execz .LBB13_2 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -537,7 +537,6 @@ ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; 
GFX803-NEXT: ;;#ASMSTART ; GFX803-NEXT: ;;#ASMEND -; GFX803-NEXT: s_mov_b32 s4, 0x40000 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -554,7 +553,6 @@ ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x40000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -569,8 +567,6 @@ ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill -; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: ;;#ASMSTART ; GFX1010-NEXT: ;;#ASMEND ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload @@ -585,7 +581,6 @@ ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: s_movk_i32 s0, 0x1000 ; GFX1100-NEXT: scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill -; GFX1100-NEXT: s_movk_i32 s0, 0x1000 ; GFX1100-NEXT: ;;#ASMSTART ; GFX1100-NEXT: ;;#ASMEND ; GFX1100-NEXT: scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -76,12 +76,10 @@ ; CHECK-NEXT: ; %bb.10: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[20:21], -1 ; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] ; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] ; CHECK-NEXT: s_branch .LBB0_2 ; 
CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 ; CHECK-NEXT: ; implicit-def: $sgpr16_sgpr17 ; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -22,18 +22,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 vcc_lo, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:52 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_kernel: @@ -43,7 +37,6 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -55,15 +48,12 @@ ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v4 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_kernel: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -74,9 +64,6 @@ ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:36 ; 
GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:20 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v4 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -100,16 +87,10 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:52 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4 -; GFX9-PAL-NEXT: s_nop 0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_kernel: @@ -124,11 +105,6 @@ ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_kernel: @@ -153,15 +129,10 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: s_mov_b32 s2, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:52 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:36 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:4 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v4 -; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_kernel: @@ -176,7 +147,6 @@ 
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 @@ -188,15 +158,12 @@ ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v4 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_kernel: ; GFX11-PAL: ; %bb.0: ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -207,15 +174,11 @@ ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v4 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) - call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -235,11 +198,6 @@ ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s32 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -248,7 +206,6 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s32 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -260,9 +217,6 @@ ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v4 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -271,7 +225,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s32 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -282,9 +236,6 @@ ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v4 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -303,11 +254,6 @@ ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 -; GFX9-PAL-NEXT: s_nop 0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -324,11 +270,6 @@ ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 -; 
GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_mov_b32_e32 v0, s32 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -337,7 +278,6 @@ ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s32 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 @@ -349,9 +289,6 @@ ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v4 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -360,7 +297,7 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v4, s32 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -371,15 +308,26 @@ ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v4 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: zero_init_foo: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 +; 
GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 +; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 +; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) - call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -400,10 +348,6 @@ ; GFX9-NEXT: s_add_i32 s0, s0, 4 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_kernel: @@ -424,10 +368,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_kernel: @@ -444,10 +384,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_kernel: @@ -470,10 +406,6 @@ ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_kernel: @@ -490,10 +422,6 @@ ; GFX940-NEXT: s_add_i32 s0, s0, 4 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_kernel: @@ -519,10 +447,6 @@ ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v0 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_kernel: @@ -539,11 +463,22 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm +; GCN-LABEL: store_load_sindex_kernel: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 15 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s1, s0, 2 +; GCN-NEXT: s_and_b32 s0, s0, 15 +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: s_add_u32 s1, 4, s1 +; GCN-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_add_u32 s0, 4, s0 +; GCN-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -554,7 +489,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -573,10 +507,6 @@ ; GFX9-NEXT: s_add_i32 s0, s0, 4 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; 
GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_foo: @@ -595,10 +525,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_foo: @@ -613,10 +539,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_foo: @@ -638,10 +560,6 @@ ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_foo: @@ -656,10 +574,6 @@ ; GFX940-NEXT: s_add_i32 s0, s0, 4 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_foo: @@ -683,10 +597,6 @@ ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v0 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_foo: @@ -701,11 +611,20 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; 
GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm +; GCN-LABEL: store_load_sindex_foo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_lshl_b32 s1, s0, 2 +; GCN-NEXT: s_and_b32 s0, s0, 15 +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: s_add_u32 s1, 4, s1 +; GCN-NEXT: v_mov_b32_e32 v0, 15 +; GCN-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_add_u32 s0, 4, s0 +; GCN-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -716,7 +635,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -733,10 +651,6 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_kernel: @@ -753,10 +667,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_kernel: @@ -768,10 +678,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_kernel: @@ 
-791,10 +697,6 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_kernel: @@ -806,10 +708,6 @@ ; GFX940-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vindex_kernel: @@ -831,10 +729,6 @@ ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v0 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_kernel: @@ -846,11 +740,17 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm +; GCN-LABEL: store_load_vindex_kernel: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 15 +; GCN-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_sub_u32_e32 v0, 4, v0 +; GCN-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -863,7 +763,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -880,9 +779,6 @@ ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v1 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_vindex_foo: @@ -897,10 +793,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s32 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_foo: @@ -915,10 +807,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s32 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_foo: @@ -933,9 +821,6 @@ ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v1 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_vindex_foo: @@ -949,10 +834,6 @@ ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s32 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_foo: @@ -967,10 +848,6 @@ ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-PAL-NEXT: v_mov_b32_e32 v0, s32 -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v0 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_vindex_foo: @@ -985,11 +862,19 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s32 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: store_load_vindex_foo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 15 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_and_b32_e32 v0, v0, v2 +; GCN-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -1000,7 +885,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -1064,6 +948,13 @@ ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: private_ptr_foo: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GCN-NEXT: scratch_store_dword v0, v1, off offset:4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 store float 1.000000e+01, float addrspace(5)* %gep, align 4 ret 
void @@ -1086,22 +977,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 vcc_lo, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:260 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_small_offset_kernel: @@ -1113,7 +994,6 @@ ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -1121,17 +1001,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_mov_b32_e32 v5, 0x104 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v4 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v5 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_small_offset_kernel: @@ -1139,7 +1012,7 @@ ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 
; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -1150,12 +1023,6 @@ ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:276 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:292 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:308 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v4 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v5 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1182,20 +1049,10 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:260 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308 -; GFX9-PAL-NEXT: s_nop 0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_small_offset_kernel: @@ -1212,15 +1069,6 @@ ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: @@ -1247,20 +1095,11 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: s_mov_b32 s2, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: 
s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:260 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v5, 0x104 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:276 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:308 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v4 -; GFX1010-PAL-NEXT: ;;#ASMEND -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v5 -; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: @@ -1277,7 +1116,6 @@ ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 @@ -1285,17 +1123,10 @@ ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v5, 0x104 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v4 -; GFX1030-PAL-NEXT: ;;#ASMEND -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v5 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_small_offset_kernel: @@ -1303,7 +1134,7 @@ ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104 +; GFX11-PAL-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -1314,12 +1145,6 @@ ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v4 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v5 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1328,8 +1153,6 @@ %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) - call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -1351,15 +1174,6 @@ ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 -; GFX9-NEXT: v_mov_b32_e32 v0, s32 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1370,7 +1184,6 @@ ; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -1378,18 +1191,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s32 -; GFX10-NEXT: v_mov_b32_e32 v5, vcc_lo ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v4 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v5 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1400,24 +1205,17 @@ ; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v4 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v5 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1438,15 +1236,6 @@ ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 
-; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1465,16 +1254,6 @@ ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 -; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, s32 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1485,7 +1264,6 @@ ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 @@ -1493,18 +1271,10 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s32 -; GFX10-PAL-NEXT: v_mov_b32_e32 v5, vcc_lo ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v4 -; GFX10-PAL-NEXT: ;;#ASMEND -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v5 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1515,34 +1285,42 @@ ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-PAL-NEXT: v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v4 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v5 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: zero_init_small_offset_foo: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v0, off, s32 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 +; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 +; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 +; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %padding = alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = 
bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) - call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -1565,14 +1343,6 @@ ; GFX9-NEXT: s_addk_i32 s0, 0x104 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: @@ -1585,7 +1355,6 @@ ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 @@ -1596,13 +1365,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v1 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: @@ -1610,7 +1372,7 @@ ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 +; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 @@ -1621,13 +1383,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v1 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1653,14 +1408,6 @@ ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: @@ -1679,14 +1426,6 @@ ; GFX940-NEXT: s_addk_i32 s0, 0x104 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1702,7 +1441,6 @@ ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1716,13 +1454,6 @@ ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v0 -; GFX1010-PAL-NEXT: ;;#ASMEND -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v1 -; GFX1010-PAL-NEXT: ;;#ASMEND ; 
GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1740,7 +1471,6 @@ ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -1751,13 +1481,6 @@ ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v0 -; GFX1030-PAL-NEXT: ;;#ASMEND -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v1 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1765,7 +1488,7 @@ ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -1776,13 +1499,6 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v1 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1797,8 +1513,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - 
call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -1820,14 +1534,6 @@ ; GFX9-NEXT: s_addk_i32 s0, 0x104 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_small_offset_foo: @@ -1848,21 +1554,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v1 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_small_offset_foo: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 +; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 @@ -1872,13 +1570,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v1 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1903,14 +1594,6 @@ ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: 
;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_small_offset_foo: @@ -1927,14 +1610,6 @@ ; GFX940-NEXT: s_addk_i32 s0, 0x104 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1961,14 +1636,6 @@ ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v0 -; GFX1010-PAL-NEXT: ;;#ASMEND -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v1 -; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1994,21 +1661,13 @@ ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v0 -; GFX1030-PAL-NEXT: ;;#ASMEND -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v1 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 +; 
GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 @@ -2018,13 +1677,6 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v1 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -2039,8 +1691,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2060,14 +1710,6 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v1 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: @@ -2086,14 +1728,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v1 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: @@ -2104,16 +1738,8 @@ ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 ; 
GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v1 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -2136,14 +1762,6 @@ ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v1 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_small_offset_kernel: @@ -2157,14 +1775,6 @@ ; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 -; GFX940-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v1 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -2189,14 +1799,6 @@ ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v0 -; GFX1010-PAL-NEXT: ;;#ASMEND -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v1 -; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: 
store_load_vindex_small_offset_kernel: @@ -2220,14 +1822,6 @@ ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v0 -; GFX1030-PAL-NEXT: ;;#ASMEND -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v1 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -2238,16 +1832,8 @@ ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v1 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -2264,8 +1850,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2285,13 +1869,6 @@ ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s32 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v1 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
store_load_vindex_small_offset_foo: @@ -2299,26 +1876,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 s1, s32, 0x100 ; GFX10-NEXT: s_add_i32 s0, s32, 0x100 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s1 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s0 +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo ; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s32 -; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v1 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_small_offset_foo: @@ -2326,7 +1894,6 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2335,13 +1902,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v1 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -2359,13 +1919,6 @@ ; 
GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v1 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_vindex_small_offset_foo: @@ -2381,15 +1934,6 @@ ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s32 -; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -2397,26 +1941,17 @@ ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x100 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x100 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s1 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s0 +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo ; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s32 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v0 -; 
GFX10-PAL-NEXT: ;;#ASMEND -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v1 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -2424,7 +1959,6 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2433,14 +1967,21 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v1 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: store_load_vindex_small_offset_foo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 15 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_and_b32_e32 v0, v0, v2 +; GCN-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -2454,8 +1995,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect 
"; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2483,15 +2022,6 @@ ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_large_offset_kernel: @@ -2514,18 +2044,10 @@ ; GFX10-NEXT: s_movk_i32 s2, 0x4004 ; GFX10-NEXT: s_movk_i32 s1, 0x4004 ; GFX10-NEXT: s_movk_i32 s0, 0x4004 -; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s2 -; GFX10-NEXT: v_mov_b32_e32 v5, 0x4004 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v4 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v5 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_large_offset_kernel: @@ -2542,18 +2064,11 @@ ; GFX11-NEXT: s_movk_i32 s2, 0x4004 ; GFX11-NEXT: s_movk_i32 s1, 0x4004 ; GFX11-NEXT: s_movk_i32 s0, 0x4004 -; GFX11-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v4 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v5 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-NEXT: s_endpgm ; @@ -2585,15 +2100,6 @@ ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 -; GFX9-PAL-NEXT: s_nop 0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_large_offset_kernel: @@ -2614,15 +2120,6 @@ ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: @@ -2652,17 +2149,9 @@ ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x4004 ; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v5, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v4 -; GFX1010-PAL-NEXT: ;;#ASMEND -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v5 -; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: @@ -2690,18 +2179,10 @@ ; GFX1030-PAL-NEXT: s_movk_i32 s2, 0x4004 ; GFX1030-PAL-NEXT: s_movk_i32 s1, 
0x4004 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x4004 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v5, 0x4004 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v4 -; GFX1030-PAL-NEXT: ;;#ASMEND -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v5 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_large_offset_kernel: @@ -2718,18 +2199,11 @@ ; GFX11-PAL-NEXT: s_movk_i32 s2, 0x4004 ; GFX11-PAL-NEXT: s_movk_i32 s1, 0x4004 ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 -; GFX11-PAL-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v4 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v5 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2738,8 +2212,6 @@ %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) - call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -2757,24 +2229,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 
v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_add_i32 s3, s32, 0x4004 -; GFX9-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX9-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s3 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 +; GFX9-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-NEXT: v_mov_b32_e32 v0, vcc_lo -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2785,7 +2247,6 @@ ; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_add_i32 s4, s32, 0x4004 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -2793,23 +2254,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: s_add_i32 s3, s32, 4 ; GFX10-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX10-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s3 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s2 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX10-NEXT: scratch_store_dwordx4 off, 
v[0:3], s0 offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v4 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v5 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2820,29 +2272,21 @@ ; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_add_i32 s4, s32, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_add_i32 s3, s32, 0x4004 ; GFX11-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX11-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v4 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v5 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2859,24 +2303,14 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-PAL-NEXT: s_add_i32 s3, s32, 0x4004 -; GFX9-PAL-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s3 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16 -; 
GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 +; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -2891,25 +2325,14 @@ ; GFX940-NEXT: s_mov_b32 s3, s0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: s_add_i32 s3, s32, 0x4004 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX940-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s3 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 +; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, vcc_lo -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 +; GFX940-NEXT: scratch_store_dwordx4 off, 
v[0:3], vcc_hi offset:48 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2920,7 +2343,6 @@ ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 -; GFX10-PAL-NEXT: s_add_i32 s4, s32, 0x4004 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 @@ -2928,23 +2350,14 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-PAL-NEXT: s_add_i32 s3, s32, 4 ; GFX10-PAL-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s4 -; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s3 -; GFX10-PAL-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v4 -; GFX10-PAL-NEXT: ;;#ASMEND -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v5 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -2955,29 +2368,21 @@ ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_add_i32 s4, s32, 4 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-PAL-NEXT: s_add_i32 s3, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 s2, s32, 
0x4004 ; GFX11-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX11-PAL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v4 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v5 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2986,8 +2391,6 @@ %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) - call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -3010,14 +2413,6 @@ ; GFX9-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: @@ -3030,7 +2425,6 @@ ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 @@ -3041,13 +2435,6 @@ ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v1 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: @@ -3055,7 +2442,7 @@ ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 +; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 @@ -3066,13 +2453,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v1 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -3098,14 +2478,6 @@ ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: @@ -3124,14 +2496,6 @@ ; GFX940-NEXT: s_addk_i32 s0, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX940-NEXT: ;;#ASMSTART -; 
GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -3147,7 +2511,6 @@ ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -3161,13 +2524,6 @@ ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v0 -; GFX1010-PAL-NEXT: ;;#ASMEND -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v1 -; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -3185,7 +2541,6 @@ ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -3196,13 +2551,6 @@ ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v0 -; GFX1030-PAL-NEXT: ;;#ASMEND -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v1 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -3210,7 +2558,7 @@ ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -3221,13 +2569,6 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v1 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -3242,8 +2583,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -3265,14 +2604,6 @@ ; GFX9-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_foo: @@ -3293,21 +2624,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v1 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_large_offset_foo: ; GFX11: ; %bb.0: ; %bb ; 
GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 +; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 @@ -3317,13 +2640,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v1 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -3348,14 +2664,6 @@ ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_large_offset_foo: @@ -3372,14 +2680,6 @@ ; GFX940-NEXT: s_addk_i32 s0, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -3406,14 +2706,6 @@ ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v0 -; GFX1010-PAL-NEXT: 
;;#ASMEND -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v1 -; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -3439,21 +2731,13 @@ ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v0 -; GFX1030-PAL-NEXT: ;;#ASMEND -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v1 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 @@ -3463,13 +2747,6 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v1 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -3484,8 +2761,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -3505,14 +2780,6 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, 
0x4004, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v1 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: @@ -3531,14 +2798,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v1 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: @@ -3550,16 +2809,8 @@ ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v1 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -3582,14 +2833,6 @@ ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v1 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_large_offset_kernel: @@ -3604,14 +2847,6 @@ ; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; 
GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 -; GFX940-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v1 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -3636,14 +2871,6 @@ ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v0 -; GFX1010-PAL-NEXT: ;;#ASMEND -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v1 -; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -3667,14 +2894,6 @@ ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v0 -; GFX1030-PAL-NEXT: ;;#ASMEND -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v1 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -3686,16 +2905,8 @@ ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: 
; use v1 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -3712,8 +2923,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -3723,8 +2932,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX9-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 +; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 @@ -3733,14 +2942,6 @@ ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 -; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v1 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_vindex_large_offset_foo: @@ -3748,27 +2949,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 s2, s32, 0x4004 -; GFX10-NEXT: s_add_i32 s1, s32, 0x4004 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s1 +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo ; GFX10-NEXT: scratch_load_dword 
v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_add_i32 s0, s32, 4 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v1 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_large_offset_foo: @@ -3776,25 +2967,17 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: s_add_i32 s2, s32, 0x4004 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4004 -; GFX11-NEXT: s_add_i32 s0, s32, 4 -; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 v0, v2, s2 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, s1 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v1 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -3802,8 +2985,8 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v1, off, 
s32 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 @@ -3812,14 +2995,6 @@ ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v1 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_vindex_large_offset_foo: @@ -3829,24 +3004,14 @@ ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: s_add_i32 s1, s32, 0x4004 +; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, s1 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, vcc_lo sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX940-NEXT: v_mov_b32_e32 v0, vcc_lo ; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -3854,27 +3019,17 @@ ; 
GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-PAL-NEXT: s_add_i32 s2, s32, 0x4004 -; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x4004 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s1 +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo ; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_add_i32 s0, s32, 4 -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v0 -; GFX10-PAL-NEXT: ;;#ASMEND -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v1 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -3882,26 +3037,34 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: s_add_i32 s2, s32, 0x4004 -; GFX11-PAL-NEXT: s_add_i32 s1, s32, 0x4004 -; GFX11-PAL-NEXT: s_add_i32 s0, s32, 4 -; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s2 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s1 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v1 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: store_load_vindex_large_offset_foo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 15 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_and_b32_e32 v0, v0, v2 +; GCN-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GCN-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GCN-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -3915,8 +3078,6 @@ %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 - call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 - call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -3936,10 +3097,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use 
v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: @@ -3958,10 +3115,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_large_imm_offset_kernel: @@ -3974,10 +3127,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -4000,10 +3149,6 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_large_imm_offset_kernel: @@ -4017,10 +3162,6 @@ ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -4045,10 +3186,6 @@ ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1010-PAL-NEXT: ;;#ASMSTART -; GFX1010-PAL-NEXT: ; use v0 -; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -4072,10 +3209,6 @@ ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX1030-PAL-NEXT: ;;#ASMSTART -; GFX1030-PAL-NEXT: ; use v0 -; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -4088,10 +3221,6 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -4101,7 +3230,6 @@ store volatile i32 15, i32 addrspace(5)* %i7, align 4 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 - call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0 ret void } @@ -4111,20 +3239,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, vcc_lo +; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 -; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_large_imm_offset_foo: @@ -4134,19 +3257,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3800 -; 
GFX10-NEXT: s_add_i32 s1, s32, 4 -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX10-NEXT: v_mov_b32_e32 v0, vcc_lo -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_large_imm_offset_foo: @@ -4155,17 +3273,12 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, vcc_lo -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: @@ -4173,20 +3286,15 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_lo +; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_hi ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 
offset:3712 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_large_imm_offset_foo: @@ -4201,11 +3309,6 @@ ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 vcc_hi, s32, 4 -; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: @@ -4215,19 +3318,14 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-PAL-NEXT: s_add_i32 s1, s32, 4 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, s1 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, vcc_lo ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v0 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_foo: @@ -4236,18 +3334,26 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, s32 
offset:3716 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: store_load_large_imm_offset_foo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 13 +; GCN-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0x3000 +; GCN-NEXT: v_mov_b32_e32 v1, 15 +; GCN-NEXT: scratch_store_dword v0, v1, s32 offset:3712 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: scratch_load_dword v0, v0, s32 offset:3712 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef @@ -4256,7 +3362,6 @@ store volatile i32 15, i32 addrspace(5)* %i7, align 4 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 - call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0 ret void } @@ -4267,17 +3372,14 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-NEXT: scratch_store_dword v0, v2, off offset:1024 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v1 -; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: 
s_endpgm ; ; GFX10-LABEL: store_load_vidx_sidx_offset: @@ -4295,10 +3397,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 -; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vidx_sidx_offset: @@ -4311,10 +3409,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: @@ -4324,20 +3418,17 @@ ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-PAL-NEXT: scratch_store_dword v0, v2, off offset:1024 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v1 -; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vidx_sidx_offset: @@ -4350,10 +3441,6 @@ ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 4 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: 
store_load_vidx_sidx_offset: @@ -4376,10 +3463,6 @@ ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v0 -; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: @@ -4392,11 +3475,18 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 -; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm +; GCN-LABEL: store_load_vidx_sidx_offset: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 15 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GCN-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -4405,7 +3495,6 @@ %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 store volatile i32 15, i32 addrspace(5)* %gep, align 4 %load = load volatile i32, i32 addrspace(5)* %gep, align 4 - call void asm sideeffect "; use $0", "s"([32 x i32] addrspace(5)* %alloca) #0 ret void } @@ -4488,6 +3577,16 @@ ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: store_load_i64_aligned: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 15 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 8 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 @@ -4573,6 +3672,16 @@ ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: store_load_i64_unaligned: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 15 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 1 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 @@ -4665,6 +3774,17 @@ ; GFX11-PAL-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: store_load_v3i32_unaligned: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_mov_b32_e32 v4, 3 +; GCN-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 @@ -4762,6 +3882,18 @@ ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: store_load_v4i32_unaligned: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: 
v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_mov_b32_e32 v4, 3 +; GCN-NEXT: v_mov_b32_e32 v5, 4 +; GCN-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -374,6 +374,7 @@ ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Shrink Wrapping analysis ; GCN-O1-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O1-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O1-NEXT: Control Flow Optimizer ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Tail Duplication @@ -670,6 +671,7 @@ ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Shrink Wrapping analysis ; GCN-O1-OPTS-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O1-OPTS-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O1-OPTS-NEXT: Control Flow Optimizer ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Tail Duplication @@ -968,6 +970,7 @@ ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Shrink Wrapping analysis ; GCN-O2-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O2-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O2-NEXT: Control Flow Optimizer ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Tail Duplication @@ -1279,6 +1282,7 @@ ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Shrink Wrapping analysis ; GCN-O3-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O3-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O3-NEXT: Control Flow Optimizer ; 
GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Tail Duplication diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -188,7 +188,6 @@ ; GCN-NEXT: ; %bb.3: ; %LeafBlock1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_cmp_eq_u32 s8, 1 -; GCN-NEXT: s_mov_b64 s[4:5], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-NEXT: ; %bb.4: ; %case1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -187,8 +187,6 @@ ; SI-NEXT: s_branch .LBB3_3 ; SI-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: s_mov_b64 s[12:13], -1 -; SI-NEXT: s_mov_b64 s[14:15], -1 ; SI-NEXT: .LBB3_2: ; %Flow ; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; SI-NEXT: s_and_b64 vcc, exec, s[14:15] @@ -206,7 +204,6 @@ ; SI-NEXT: s_cbranch_vccz .LBB3_1 ; SI-NEXT: ; %bb.5: ; %if.end ; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: s_mov_b64 s[14:15], -1 ; SI-NEXT: s_mov_b64 vcc, s[6:7] ; SI-NEXT: s_cbranch_vccz .LBB3_7 ; SI-NEXT: ; %bb.6: ; %if.else @@ -263,8 +260,6 @@ ; FLAT-NEXT: s_branch .LBB3_3 ; FLAT-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 ; FLAT-NEXT: s_mov_b64 s[8:9], 0 -; FLAT-NEXT: s_mov_b64 s[12:13], -1 -; FLAT-NEXT: s_mov_b64 s[14:15], -1 ; FLAT-NEXT: .LBB3_2: ; %Flow ; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; FLAT-NEXT: s_and_b64 vcc, exec, s[14:15] @@ -282,7 +277,6 @@ ; FLAT-NEXT: s_cbranch_vccz .LBB3_1 ; FLAT-NEXT: ; %bb.5: ; %if.end ; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: s_mov_b64 s[14:15], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[6:7] ; FLAT-NEXT: s_cbranch_vccz .LBB3_7 ; FLAT-NEXT: ; %bb.6: ; %if.else diff --git 
a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -60,7 +60,6 @@ ; CHECK-NEXT: s_cmp_lg_u32 s10, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_14 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: .LBB0_4: ; %Flow3 ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec @@ -103,7 +102,6 @@ ; CHECK-NEXT: s_branch .LBB0_10 ; CHECK-NEXT: .LBB0_14: ; %cond.false.i8 ; CHECK-NEXT: s_mov_b64 s[2:3], -1 -; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 entry: diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll @@ -140,7 +140,6 @@ ; GCN-NEXT: s_cbranch_scc1 .LBB0_10 ; GCN-NEXT: ; %bb.9: ; GCN-NEXT: s_mov_b64 s[6:7], -1 -; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] ; GCN-NEXT: s_cbranch_execnz .LBB0_3 ; GCN-NEXT: s_branch .LBB0_4 @@ -173,7 +172,6 @@ ; GCN-NEXT: ; %bb.15: ; %LeafBlock9 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 ; GCN-NEXT: s_mov_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GCN-NEXT: ; %bb.16: ; %do.body.i.i.i.i ; GCN-NEXT: s_mov_b64 s[4:5], exec diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -34,7 +34,6 @@ ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_movk_i32 s0, 0xffc ; FLATSCR-NEXT: 
scratch_load_dword v0, off, s0 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 @@ -71,7 +70,6 @@ ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: s_mov_b32 s4, 0x40000 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -90,7 +88,6 @@ ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 @@ -237,7 +234,6 @@ ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_movk_i32 s8, 0x1004 ; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART @@ -320,7 +316,6 @@ ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART @@ -367,7 +362,6 @@ ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_nop 0 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload @@ -391,7 +385,6 @@ ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xffc ; FLATSCR-NEXT: 
scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10551,7 +10551,6 @@ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_mov_b32 s2, 0x84800 ; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10796,7 +10795,7 @@ ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 +; GFX9-FLATSCR-NEXT: s_nop 0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload @@ -11032,7 +11031,6 @@ ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v60 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v65 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v66 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v67 diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -149,6 +149,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; CHECK-NEXT: Machine Late Instructions Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis 
; CHECK-NEXT: Tail Duplication diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -1652,7 +1652,6 @@ ; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-ENABLE-NEXT: @ %bb.1: @ %loop2a.preheader -; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: movs r1, #0 ; THUMB-ENABLE-NEXT: mov r2, r0 ; THUMB-ENABLE-NEXT: b LBB11_3 @@ -1679,7 +1678,6 @@ ; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-DISABLE-NEXT: @ %bb.1: @ %loop2a.preheader -; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: movs r1, #0 ; THUMB-DISABLE-NEXT: mov r2, r0 ; THUMB-DISABLE-NEXT: b LBB11_3 diff --git a/llvm/test/CodeGen/ARM/fpclamptosat.ll b/llvm/test/CodeGen/ARM/fpclamptosat.ll --- a/llvm/test/CodeGen/ARM/fpclamptosat.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat.ll @@ -3764,7 +3764,6 @@ ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB48_19: @ %entry -; SOFT-NEXT: ldr r0, .LCPI48_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq .LBB48_21 @@ -4347,7 +4346,6 @@ ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB51_19: @ %entry -; SOFT-NEXT: ldr r0, .LCPI51_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq .LBB51_21 diff --git a/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll --- a/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll +++ b/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll @@ -22,7 +22,7 @@ ; for.body -> for.cond.backedge (100%) ; -> cond.false.i (0%) ; CHECK: bb.1.for.body: -; CHECK: successors: %bb.2(0x80000000), %bb.4(0x00000000) +; CHECK: successors: %bb.2(0x80000000), %bb.5(0x00000000) for.body: br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1 diff --git 
a/llvm/test/CodeGen/ARM/jump-table-islands.ll b/llvm/test/CodeGen/ARM/jump-table-islands.ll --- a/llvm/test/CodeGen/ARM/jump-table-islands.ll +++ b/llvm/test/CodeGen/ARM/jump-table-islands.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=armv7-apple-ios8.0 -o - %s | FileCheck %s -%BigInt = type i5500 +%BigInt = type i8500 define %BigInt @test_moved_jumptable(i1 %tst, i32 %sw, %BigInt %l) { ; CHECK-LABEL: test_moved_jumptable: diff --git a/llvm/test/CodeGen/ARM/reg_sequence.ll b/llvm/test/CodeGen/ARM/reg_sequence.ll --- a/llvm/test/CodeGen/ARM/reg_sequence.ll +++ b/llvm/test/CodeGen/ARM/reg_sequence.ll @@ -283,7 +283,6 @@ ; CHECK-NEXT: vst1.32 {d17[1]}, [r0:32] ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: movne r0, #0 ; CHECK-NEXT: bxne lr ; CHECK-NEXT: LBB9_1: ; CHECK-NEXT: trap diff --git a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll --- a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll +++ b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll @@ -14,9 +14,8 @@ ;