Index: llvm/include/llvm/CodeGen/CodeGenPassBuilder.h =================================================================== --- llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -1130,6 +1130,9 @@ if (!TM.requiresStructuredCFG()) addPass(TailDuplicatePass()); + // Cleanup of redundant (identical) address/immediate loads. + addPass(MachineLateInstrsCleanupPass()); + // Copy propagation. addPass(MachineCopyPropagationPass()); } Index: llvm/include/llvm/CodeGen/MachinePassRegistry.def =================================================================== --- llvm/include/llvm/CodeGen/MachinePassRegistry.def +++ llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -150,6 +150,7 @@ DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass, ()) DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass, ()) DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass, ()) Index: llvm/include/llvm/CodeGen/Passes.h =================================================================== --- llvm/include/llvm/CodeGen/Passes.h +++ llvm/include/llvm/CodeGen/Passes.h @@ -330,6 +330,10 @@ MachineFunctionPass *createMachineCopyPropagationPass(bool UseCopyInstr); + /// MachineLateInstrsCleanup - This pass removes redundant identical + /// instructions after register allocation and rematerialization. + extern char &MachineLateInstrsCleanupID; + /// PeepholeOptimizer - This pass performs peephole optimizations - /// like extension and comparison eliminations. 
extern char &PeepholeOptimizerID; Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -280,6 +280,7 @@ void initializeMachineDominatorTreePass(PassRegistry&); void initializeMachineFunctionPrinterPassPass(PassRegistry&); void initializeMachineFunctionSplitterPass(PassRegistry &); +void initializeMachineLateInstrsCleanupPass(PassRegistry&); void initializeMachineLICMPass(PassRegistry&); void initializeMachineLoopInfoPass(PassRegistry&); void initializeMachineModuleInfoWrapperPassPass(PassRegistry &); Index: llvm/lib/CodeGen/CMakeLists.txt =================================================================== --- llvm/lib/CodeGen/CMakeLists.txt +++ llvm/lib/CodeGen/CMakeLists.txt @@ -117,6 +117,7 @@ MachineFunctionSplitter.cpp MachineInstrBundle.cpp MachineInstr.cpp + MachineLateInstrsCleanup.cpp MachineLICM.cpp MachineLoopInfo.cpp MachineLoopUtils.cpp Index: llvm/lib/CodeGen/CodeGen.cpp =================================================================== --- llvm/lib/CodeGen/CodeGen.cpp +++ llvm/lib/CodeGen/CodeGen.cpp @@ -77,6 +77,7 @@ initializeMachineCycleInfoWrapperPassPass(Registry); initializeMachineDominatorTreePass(Registry); initializeMachineFunctionPrinterPassPass(Registry); + initializeMachineLateInstrsCleanupPass(Registry); initializeMachineLICMPass(Registry); initializeMachineLoopInfoPass(Registry); initializeMachineModuleInfoWrapperPassPass(Registry); Index: llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp =================================================================== --- /dev/null +++ llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp @@ -0,0 +1,288 @@ +//==--- MachineLateInstrsCleanup.cpp - Late Instructions Cleanup Pass -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This simple pass removes any identical and redundant immediate or address +// loads to the same register. The immediate loads removed can originally be +// the result of rematerialization, while the addresses are redundant frame +// addressing anchor points created during Frame Indices elimination. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/BreadthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-latecleanup" + +STATISTIC(NumRemoved, "Number of redundant instructions removed."); + +namespace { + +class MachineLateInstrsCleanup : public MachineFunctionPass { + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + + // Data structures to map regs to their definitions per MBB. + using Reg2DefMap = std::map<Register, MachineInstr *>; + std::vector<Reg2DefMap> RegDefs; + + // Set of visited MBBs. + BitVector Visited; + + // Return true if all predecessors of MBB have been visited. + bool allPredsVisited(MachineBasicBlock *MBB); + + // Walk through the instructions in MBB and remove any redundant + // instructions. + bool processBlock(MachineBasicBlock *MBB); + + // Visit MBB and then any of its successors that become ready. 
+ bool visitBlock(MachineBasicBlock *MBB); + +public: + static char ID; // Pass identification, replacement for typeid + + MachineLateInstrsCleanup() : MachineFunctionPass(ID) { + initializeMachineLateInstrsCleanupPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } +}; + +} // end anonymous namespace + +char MachineLateInstrsCleanup::ID = 0; + +char &llvm::MachineLateInstrsCleanupID = MachineLateInstrsCleanup::ID; + +INITIALIZE_PASS(MachineLateInstrsCleanup, DEBUG_TYPE, + "Machine Late Instructions Cleanup Pass", false, false) + +bool MachineLateInstrsCleanup::allPredsVisited(MachineBasicBlock *MBB) { + return llvm::all_of(MBB->predecessors(), [&](const MachineBasicBlock *Pred) { + return Visited.test(Pred->getNumber()); + }); +} + +bool MachineLateInstrsCleanup::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + bool Changed = false; + + TRI = MF.getSubtarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + + RegDefs.clear(); + RegDefs.resize(MF.getNumBlockIDs()); + Visited.clear(); + Visited.resize(MF.getNumBlockIDs()); + + // Visit all MBBs in an order that maximises the reuse from predecessors. + // Put the ones that are not ready in Pending and handle them in a second + // round. 
+ std::vector<MachineBasicBlock *> Pending; + for (auto *MBB : breadth_first(&MF)) + if (!Visited.test(MBB->getNumber())) { + if (allPredsVisited(MBB)) + Changed |= visitBlock(MBB); + else + Pending.push_back(MBB); + } + + for (auto *MBB : Pending) + if (!Visited.test(MBB->getNumber())) + Changed |= visitBlock(MBB); + + return Changed; +} + +// Clear any previous kill flag on Reg found before I in MBB. Walk backwards +// in MBB and if needed continue in predecessors until a use/def of Reg is +// encountered. This seems to be faster in practice than tracking kill +// flags in a map. +static void clearKillsForDef(Register Reg, MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + BitVector &VisitedPreds, + const TargetRegisterInfo *TRI) { + VisitedPreds.set(MBB->getNumber()); + while (I != MBB->begin()) { + I--; + for (auto &MO : I->operands()) + if (MO.isReg()) { + if (MO.isDef() && TRI->regsOverlap(MO.getReg(), Reg)) + return; + if (MO.readsReg() && MO.getReg() == Reg) { + MO.setIsKill(false); + return; + } + } + } + + // If earlier def is not in MBB, continue in predecessors. + if (!MBB->isLiveIn(Reg)) + MBB->addLiveIn(Reg); + assert(!MBB->pred_empty() && "Predecessor def not found!"); + for (MachineBasicBlock *Pred : MBB->predecessors()) + if (!VisitedPreds.test(Pred->getNumber())) + clearKillsForDef(Reg, Pred, Pred->end(), VisitedPreds, TRI); +} + +static void removeRedundantDef(MachineInstr *MI, + const TargetRegisterInfo *TRI) { + Register Reg = MI->getOperand(0).getReg(); + BitVector VisitedPreds(MI->getParent()->getParent()->getNumBlockIDs()); + clearKillsForDef(Reg, MI->getParent(), MI->getIterator(), VisitedPreds, TRI); + MI->eraseFromParent(); + ++NumRemoved; +} + +// Return true if MI is a potential candidate for reuse/removal and if so +// also the register it defines in DefedReg. A candidate is a simple +// instruction that does not touch memory, has only one register definition +// and the only reg it may use is FrameReg. 
Typically this is an immediate +// load or a load-address instruction. +static bool isCandidate(const MachineInstr *MI, Register &DefedReg, + Register FrameReg) { + DefedReg = MCRegister::NoRegister; + bool SawStore = true; + if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() || + MI->isInlineAsm()) + return false; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + if (MO.isDef()) { + if (i == 0 && !MO.isImplicit() && !MO.isDead()) + DefedReg = MO.getReg(); + else + return false; + } else if (MO.getReg() && MO.getReg() != FrameReg) + return false; + } else if (!(MO.isImm() || MO.isCImm() || MO.isFPImm() || MO.isCPI() || + MO.isGlobal() || MO.isSymbol())) + return false; + } + return DefedReg.isValid(); +} + +bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) { + bool Changed = false; + Visited.set(MBB->getNumber()); + + Reg2DefMap &MBBDefs = RegDefs[MBB->getNumber()]; + + // Find reusable definitions in the predecessor(s). + if (!MBB->pred_empty()) { + MachineBasicBlock *FirstPred = *MBB->pred_begin(); + for (auto I : RegDefs[FirstPred->getNumber()]) { + Register Reg = I.first; + MachineInstr *DefMI = I.second; + if (llvm::all_of(MBB->predecessors(), [&](const MachineBasicBlock *Pred) { + if (Pred == FirstPred) + return true; + auto PredDefI = RegDefs[Pred->getNumber()].find(Reg); + return PredDefI != RegDefs[Pred->getNumber()].end() && + DefMI->isIdenticalTo(*PredDefI->second); + })) { + MBBDefs[Reg] = DefMI; + LLVM_DEBUG(dbgs() << "Reusable instruction from pred(s): in MBB#" + << MBB->getNumber() << ": " << *DefMI;); + } + } + } + + // Process MBB. 
+ MachineFunction *MF = MBB->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + Register FrameReg = TRI->getFrameRegister(*MF); + for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end();) { + MachineInstr *MI = &*(I++); + + // Clear map if the FrameReg is modified. + if (MI->modifiesRegister(FrameReg, TRI)) { + MBBDefs.clear(); + continue; + } + + Register DefedReg; + bool IsCandidate = isCandidate(MI, DefedReg, FrameReg); + + // Check for an earlier identical and reusable instruction. + if (IsCandidate) { + auto DefI = MBBDefs.find(DefedReg); + if (DefI != MBBDefs.end() && MI->isIdenticalTo(*DefI->second)) { + LLVM_DEBUG(dbgs() << "Removing redundant instruction in MBB#" + << MBB->getNumber() << ": " << *MI;); + removeRedundantDef(MI, TRI); + Changed = true; + continue; + } + } + + // Clear any entries in map that MI clobbers. + for (auto DefI = MBBDefs.begin(); DefI != MBBDefs.end();) { + Register Reg = DefI->first; + if (MI->modifiesRegister(Reg, TRI)) + DefI = MBBDefs.erase(DefI); + else + ++DefI; + } + + // Record this MI for potential later reuse. 
+ if (IsCandidate) { + LLVM_DEBUG(dbgs() << "Found interesting instruction in MBB#" + << MBB->getNumber() << ": " << *MI;); + MBBDefs[DefedReg] = MI; + } + } + + return Changed; +} + +bool MachineLateInstrsCleanup::visitBlock(MachineBasicBlock *PendingMBB) { + bool Changed = false; + + std::queue<MachineBasicBlock *> Worklist; + Worklist.push(PendingMBB); + while (!Worklist.empty()) { + MachineBasicBlock *CurrMBB = Worklist.front(); + Worklist.pop(); + Changed |= processBlock(CurrMBB); + for (MachineBasicBlock *SuccMBB : CurrMBB->successors()) + if (!Visited.test(SuccMBB->getNumber()) && allPredsVisited(SuccMBB)) + Worklist.push(SuccMBB); + } + + return Changed; +} Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1520,6 +1520,9 @@ /// Add passes that optimize machine instructions after register allocation. void TargetPassConfig::addMachineLateOptimization() { + // Cleanup of redundant immediate/address loads. + addPass(&MachineLateInstrsCleanupID); + // Branch folding must be run after regalloc and prolog/epilog insertion. addPass(&BranchFolderPassID); Index: llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -300,6 +300,7 @@ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). 
disablePass(&PrologEpilogCodeInserterID); + disablePass(&MachineLateInstrsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&TailDuplicateID); disablePass(&StackMapLivenessID); Index: llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp =================================================================== --- llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -486,6 +486,7 @@ // them. // These functions all require the NoVRegs property. + disablePass(&MachineLateInstrsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&PostRAMachineSinkingID); disablePass(&PostRASchedulerID); Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -192,6 +192,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; CHECK-NEXT: Machine Late Instructions Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication Index: llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll =================================================================== --- llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll +++ llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll @@ -29,14 +29,8 @@ ; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr x9, [x9] ; CHECK-NEXT: str x8, [sp] -; CHECK-NEXT: Lloh6: -; CHECK-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE ; CHECK-NEXT: stur x9, [x29, #-8] -; CHECK-NEXT: Lloh7: -; CHECK-NEXT: ldr x8, [x8, ___stack_chk_guard@GOTPAGEOFF] ; CHECK-NEXT: ldur x9, [x29, #-8] -; CHECK-NEXT: Lloh8: -; CHECK-NEXT: ldr x8, [x8] ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: b.ne LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %entry @@ -46,7 +40,6 @@ ; CHECK-NEXT: ret ; CHECK-NEXT: LBB0_2: ; %entry ; 
CHECK-NEXT: bl ___stack_chk_fail -; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh6, Lloh7, Lloh8 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh1, Lloh3, Lloh5 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh0, Lloh2, Lloh4 entry: Index: llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -59,26 +59,23 @@ ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: fmov s0, #1.00000000 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: mov w1, #1 ; CHECK-NEXT: mov w2, #2 -; CHECK-NEXT: st1d { z16.d }, p0, [x9] -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w3, #3 ; CHECK-NEXT: mov w4, #4 ; CHECK-NEXT: mov w5, #5 ; CHECK-NEXT: mov w6, #6 -; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl] -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w7, #7 -; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl] ; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z16.d }, p0, [x9] +; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl] +; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl] ; CHECK-NEXT: st1d { z19.d }, p0, [x9, #3, mul vl] ; CHECK-NEXT: str x8, [sp] ; CHECK-NEXT: bl callee2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -158,50 +158,29 @@ ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:8 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; 
FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:72 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:80 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:24 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:88 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:32 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:96 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:40 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:104 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:48 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:112 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:56 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:120 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:64 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:72 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:80 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:88 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:96 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:104 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:112 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:120 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:128 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 
v[0:1], off, s33 offset:8 -; FLATSCR-NEXT: s_mov_b32 s33, 0 +; FLATSCR-NEXT: s_nop 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s33 offset:16 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s33 offset:24 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s33 offset:32 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s33 offset:40 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s33 offset:48 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s33 offset:56 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s33 offset:64 ; FLATSCR-NEXT: s_movk_i32 s32, 0x50 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -594,7 +594,6 @@ ; GFX940-NEXT: scratch_store_dword v1, v0, vcc_hi sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -615,7 +614,6 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-NEXT: scratch_load_b32 v0, v2, vcc_lo glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm @@ -686,7 +684,6 @@ ; GFX940-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -702,7 +699,6 @@ 
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, vcc_lo dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -1354,7 +1354,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7-NEXT: s_cbranch_execz .LBB13_2 Index: llvm/test/CodeGen/AMDGPU/cc-update.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cc-update.ll +++ llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -537,7 +537,6 @@ ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX803-NEXT: ;;#ASMSTART ; GFX803-NEXT: ;;#ASMEND -; GFX803-NEXT: s_mov_b32 s4, 0x40000 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -554,7 +553,6 @@ ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x40000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -569,8 +567,6 @@ ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill -; GFX1010-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: ;;#ASMSTART ; GFX1010-NEXT: ;;#ASMEND ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload @@ -585,7 +581,6 @@ ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: s_movk_i32 s0, 0x1000 ; GFX1100-NEXT: scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill -; GFX1100-NEXT: s_movk_i32 s0, 0x1000 ; GFX1100-NEXT: ;;#ASMSTART ; GFX1100-NEXT: ;;#ASMEND ; GFX1100-NEXT: scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload Index: llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -502,18 +502,14 @@ ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:6 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:8 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: scratch_load_dword v1, off, vcc_hi offset:6 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -565,22 +561,15 @@ ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 -; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:6 ; 
FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 -; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:8 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; FLATSCR_GFX10-NEXT: s_clause 0x1 ; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 -; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, vcc_lo offset:6 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] Index: llvm/test/CodeGen/AMDGPU/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -23,11 +23,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:52 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:36 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:20 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4 ; GFX9-NEXT: s_endpgm ; @@ -87,11 +84,8 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:52 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:36 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:20 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4 ; GFX9-PAL-NEXT: s_endpgm ; @@ 
-130,14 +124,8 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:52 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:36 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:4 ; GFX1010-PAL-NEXT: s_endpgm ; @@ -982,13 +970,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:260 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:276 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:292 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308 ; GFX9-NEXT: s_endpgm ; @@ -1053,13 +1037,9 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:260 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:276 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:292 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308 ; GFX9-PAL-NEXT: s_endpgm ; @@ -1101,16 +1081,9 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1010-PAL-NEXT: s_mov_b32 
vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:260 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:276 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:308 ; GFX1010-PAL-NEXT: s_endpgm ; @@ -2028,11 +2001,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-NEXT: s_endpgm ; @@ -2054,11 +2024,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX10-NEXT: s_endpgm ; @@ -2073,12 +2040,10 @@ ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo -; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 -; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-NEXT: 
scratch_store_b128 off, v[0:3], vcc_lo offset:32 -; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2105,11 +2070,8 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi -; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 -; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-PAL-NEXT: s_endpgm ; @@ -2125,11 +2087,8 @@ ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX940-NEXT: s_endpgm ; @@ -2157,14 +2116,8 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX1010-PAL-NEXT: s_endpgm ; @@ -2191,11 +2144,8 @@ ; 
GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX1030-PAL-NEXT: s_endpgm ; @@ -2210,12 +2160,10 @@ ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo -; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 -; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 -; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm @@ -4141,7 +4089,6 @@ ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 16 @@ -4216,7 +4163,6 @@ ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 16 Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -372,6 +372,7 @@ ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Shrink Wrapping analysis ; GCN-O1-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O1-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O1-NEXT: Control Flow Optimizer ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Tail Duplication @@ -667,6 +668,7 @@ ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Shrink Wrapping analysis ; GCN-O1-OPTS-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O1-OPTS-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O1-OPTS-NEXT: Control Flow Optimizer ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Tail Duplication @@ -964,6 +966,7 @@ ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Shrink Wrapping analysis ; GCN-O2-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O2-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O2-NEXT: Control Flow Optimizer ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Tail Duplication @@ -1274,6 +1277,7 @@ ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Shrink Wrapping analysis ; GCN-O3-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O3-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O3-NEXT: Control Flow Optimizer ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Tail Duplication Index: llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -300,7 +300,6 @@ ; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 
offset:16 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 ; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_mov_b32_e32 v12, 0 Index: llvm/test/CodeGen/AMDGPU/multilevel-break.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -188,7 +188,6 @@ ; GCN-NEXT: ; %bb.3: ; %LeafBlock1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_cmp_eq_u32 s8, 1 -; GCN-NEXT: s_mov_b64 s[4:5], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-NEXT: ; %bb.4: ; %case1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 Index: llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -186,8 +186,6 @@ ; SI-NEXT: s_branch .LBB3_3 ; SI-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: s_mov_b64 s[10:11], -1 -; SI-NEXT: s_mov_b64 s[12:13], -1 ; SI-NEXT: .LBB3_2: ; %Flow ; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; SI-NEXT: s_and_b64 vcc, exec, s[12:13] @@ -205,7 +203,6 @@ ; SI-NEXT: s_cbranch_vccz .LBB3_1 ; SI-NEXT: ; %bb.5: ; %if.end ; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: s_mov_b64 s[12:13], -1 ; SI-NEXT: s_mov_b64 vcc, s[4:5] ; SI-NEXT: s_cbranch_vccz .LBB3_7 ; SI-NEXT: ; %bb.6: ; %if.else @@ -261,8 +258,6 @@ ; FLAT-NEXT: s_branch .LBB3_3 ; FLAT-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 ; FLAT-NEXT: s_mov_b64 s[8:9], 0 -; FLAT-NEXT: s_mov_b64 s[10:11], -1 -; FLAT-NEXT: s_mov_b64 s[12:13], -1 ; FLAT-NEXT: .LBB3_2: ; %Flow ; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; FLAT-NEXT: s_and_b64 vcc, exec, s[12:13] @@ -280,7 +275,6 @@ ; FLAT-NEXT: s_cbranch_vccz .LBB3_1 ; FLAT-NEXT: ; %bb.5: ; %if.end ; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: s_mov_b64 s[12:13], -1 ; 
FLAT-NEXT: s_mov_b64 vcc, s[4:5] ; FLAT-NEXT: s_cbranch_vccz .LBB3_7 ; FLAT-NEXT: ; %bb.6: ; %if.else Index: llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -60,7 +60,6 @@ ; CHECK-NEXT: s_cmp_lg_u32 s10, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_14 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: .LBB0_4: ; %Flow3 ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec @@ -103,7 +102,6 @@ ; CHECK-NEXT: s_branch .LBB0_10 ; CHECK-NEXT: .LBB0_14: ; %cond.false.i8 ; CHECK-NEXT: s_mov_b64 s[2:3], -1 -; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 entry: Index: llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -150,7 +150,6 @@ ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 ; MUBUF: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Reload - ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004 ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload ; Force %a to spill with no free SGPRs Index: llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -115,8 +115,7 @@ ; FLATSCR: s_movk_i32 [[SOFF1:s[0-9]+]], 0x ; GFX9-FLATSCR: s_waitcnt vmcnt(0) ; FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SOFF1]] ; 16-byte Folded Spill -; FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x -; FLATSCR: scratch_load_dwordx4 v[{{[0-9:]+}}], off, [[SOFF2]] ; 16-byte Folded Reload 
+; FLATSCR: scratch_load_dwordx4 v[{{[0-9:]+}}], off, [[SOFF1]] ; 16-byte Folded Reload define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) Index: llvm/test/CodeGen/ARM/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/ARM/O3-pipeline.ll +++ llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -147,6 +147,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; CHECK-NEXT: Machine Late Instructions Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication Index: llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll =================================================================== --- llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -1652,7 +1652,6 @@ ; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-ENABLE-NEXT: @ %bb.1: @ %loop2a.preheader -; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: movs r1, #0 ; THUMB-ENABLE-NEXT: mov r2, r0 ; THUMB-ENABLE-NEXT: b LBB11_3 @@ -1679,7 +1678,6 @@ ; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-DISABLE-NEXT: @ %bb.1: @ %loop2a.preheader -; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: movs r1, #0 ; THUMB-DISABLE-NEXT: mov r2, r0 ; THUMB-DISABLE-NEXT: b LBB11_3 Index: llvm/test/CodeGen/ARM/fpclamptosat.ll =================================================================== --- llvm/test/CodeGen/ARM/fpclamptosat.ll +++ llvm/test/CodeGen/ARM/fpclamptosat.ll @@ -3764,7 +3764,6 @@ ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB48_19: @ %entry -; SOFT-NEXT: ldr r0, .LCPI48_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq 
.LBB48_21 @@ -4347,7 +4346,6 @@ ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB51_19: @ %entry -; SOFT-NEXT: ldr r0, .LCPI51_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq .LBB51_21 Index: llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll =================================================================== --- llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll +++ llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll @@ -22,7 +22,7 @@ ; for.body -> for.cond.backedge (100%) ; -> cond.false.i (0%) ; CHECK: bb.1.for.body: -; CHECK: successors: %bb.2(0x80000000), %bb.4(0x00000000) +; CHECK: successors: %bb.2(0x80000000), %bb.5(0x00000000) for.body: br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1 Index: llvm/test/CodeGen/ARM/jump-table-islands.ll =================================================================== --- llvm/test/CodeGen/ARM/jump-table-islands.ll +++ llvm/test/CodeGen/ARM/jump-table-islands.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=armv7-apple-ios8.0 -o - %s | FileCheck %s -%BigInt = type i5500 +%BigInt = type i8500 define %BigInt @test_moved_jumptable(i1 %tst, i32 %sw, %BigInt %l) { ; CHECK-LABEL: test_moved_jumptable: Index: llvm/test/CodeGen/ARM/reg_sequence.ll =================================================================== --- llvm/test/CodeGen/ARM/reg_sequence.ll +++ llvm/test/CodeGen/ARM/reg_sequence.ll @@ -283,7 +283,6 @@ ; CHECK-NEXT: vst1.32 {d17[1]}, [r0:32] ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: movne r0, #0 ; CHECK-NEXT: bxne lr ; CHECK-NEXT: LBB9_1: ; CHECK-NEXT: trap Index: llvm/test/CodeGen/BPF/objdump_cond_op_2.ll =================================================================== --- llvm/test/CodeGen/BPF/objdump_cond_op_2.ll +++ llvm/test/CodeGen/BPF/objdump_cond_op_2.ll @@ -14,9 +14,8 @@ ;