Index: llvm/include/llvm/CodeGen/CodeGenPassBuilder.h =================================================================== --- llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -1126,6 +1126,9 @@ if (!TM.requiresStructuredCFG()) addPass(TailDuplicatePass()); + // Cleanup of redundant immediate loads. + addPass(RedundantImmLoadsCleanupPass()); + // Copy propagation. addPass(MachineCopyPropagationPass()); } Index: llvm/include/llvm/CodeGen/MachinePassRegistry.def =================================================================== --- llvm/include/llvm/CodeGen/MachinePassRegistry.def +++ llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -148,6 +148,7 @@ DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("redundantimmloads", RedundantImmLoadsCleanupPass, ()) DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass, ()) DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass, ()) Index: llvm/include/llvm/CodeGen/Passes.h =================================================================== --- llvm/include/llvm/CodeGen/Passes.h +++ llvm/include/llvm/CodeGen/Passes.h @@ -331,6 +331,10 @@ /// machine instructions. extern char &MachineCopyPropagationID; + /// RedundantImmLoadsCleanup - This pass removes redundant identical + /// instructions after register allocation and rematerialization. + extern char &RedundantImmLoadsCleanupID; + /// PeepholeOptimizer - This pass performs peephole optimizations - /// like extension and comparison eliminations. extern char &PeepholeOptimizerID; Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -287,6 +287,7 @@ void initializeMachineCSEPass(PassRegistry&); void initializeMachineCombinerPass(PassRegistry&); void initializeMachineCopyPropagationPass(PassRegistry&); +void initializeRedundantImmLoadsCleanupPass(PassRegistry&); void initializeMachineCycleInfoPrinterPassPass(PassRegistry &); void initializeMachineCycleInfoWrapperPassPass(PassRegistry &); void initializeMachineDominanceFrontierPass(PassRegistry&); Index: llvm/lib/CodeGen/CMakeLists.txt =================================================================== --- llvm/lib/CodeGen/CMakeLists.txt +++ llvm/lib/CodeGen/CMakeLists.txt @@ -188,6 +188,7 @@ RegUsageInfoCollector.cpp RegUsageInfoPropagate.cpp ReplaceWithVeclib.cpp + RedundantImmLoadsCleanup.cpp ResetMachineFunctionPass.cpp RegisterBank.cpp RegisterBankInfo.cpp Index: llvm/lib/CodeGen/CodeGen.cpp =================================================================== --- llvm/lib/CodeGen/CodeGen.cpp +++ llvm/lib/CodeGen/CodeGen.cpp @@ -100,6 +100,7 @@ initializeProcessImplicitDefsPass(Registry); initializeRABasicPass(Registry); initializeRAGreedyPass(Registry); + initializeRedundantImmLoadsCleanupPass(Registry); initializeRegAllocFastPass(Registry); initializeRegUsageInfoCollectorPass(Registry); initializeRegUsageInfoPropagationPass(Registry); Index: llvm/lib/CodeGen/RedundantImmLoadsCleanup.cpp =================================================================== --- /dev/null +++ llvm/lib/CodeGen/RedundantImmLoadsCleanup.cpp @@ -0,0 +1,285 @@ +//==- RedundantImmLoadsCleanup.cpp - Redundant Imm Loads Cleanup Pass -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This simple pass removes any identical and redundant immediate or address +// loads to the same register. The immediate loads removed can originally be +// the result of rematerialization, while the addresses are redundant frame +// addressing anchor points created during Frame Indices elimination. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/BreadthFirstIterator.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "redundantimmloads" + +STATISTIC(NumRemoved, "Number of redundant instructions removed."); + +namespace { + +using MBBSet = SmallPtrSet; + +class RedundantImmLoadsCleanup : public MachineFunctionPass { + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + const MachineRegisterInfo *MRI; + +public: + static char ID; // Pass identification, replacement for typeid + + RedundantImmLoadsCleanup() : MachineFunctionPass(ID) { + initializeRedundantImmLoadsCleanupPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + // Data structures to map regs to definitions per MBB. + typedef std::map Reg2DefMap; + typedef std::map MBB2RegDefsMap; + bool visitBlock(MachineBasicBlock *MBB, MBBSet &Visited, + MBB2RegDefsMap &RegDefs); +}; + +} // end anonymous namespace + +char RedundantImmLoadsCleanup::ID = 0; + +char &llvm::RedundantImmLoadsCleanupID = RedundantImmLoadsCleanup::ID; + +INITIALIZE_PASS(RedundantImmLoadsCleanup, DEBUG_TYPE, + "Redundant Imm Loads Cleanup Pass", false, false) + +bool RedundantImmLoadsCleanup::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + bool Changed = false; + + TRI = MF.getSubtarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + MRI = &MF.getRegInfo(); + + MBBSet Visited; + MBB2RegDefsMap RegDefs; + auto allPredsVisited = [&Visited](MachineBasicBlock *MBB) { + for (MachineBasicBlock *Pred : MBB->predecessors()) + if (!Visited.count(Pred)) + return false; + return true; + }; + + // Try to visit all blocks in an order so that all predecessors of an MBB + // were visited before, in order to reuse definitions from them. + bool Progress = true; + while (Progress) { + Progress = false; + for (auto *CurrMBB : breadth_first(&MF)) + if (!Visited.count(CurrMBB) && allPredsVisited(CurrMBB)) { + Changed |= visitBlock(CurrMBB, Visited, RegDefs); + Progress = true; + } + + if (!Progress) + for (auto *CurrMBB : breadth_first(&MF)) + if (!Visited.count(CurrMBB)) { + Changed |= visitBlock(CurrMBB, Visited, RegDefs); + Progress = true; + break; + } + } + + return Changed; +} + +// Clear any previous kill flag on Reg found before I in MBB. Walk backwards +// in MBB and if needed continue in predecessors as well until a use/def of +// Reg is encountered. +static void clearKillsForDef(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + Register Reg, + MachineInstr *MI, + MBBSet &Visited, + const TargetRegisterInfo *TRI) { + Visited.insert(MBB); + // Find the instruction that was previously the last user and clear it's + // kill flag for Reg. + while (I != MBB->begin()) { + I--; + for (auto &MO : I->operands()) + if (MO.isReg()) { + if (MO.isDef() && TRI->regsOverlap(MO.getReg(), Reg)) { + assert (I->isIdenticalTo(*MI) && "Broken redundancy assumption."); + return; + } + if (MO.readsReg() && MO.getReg() == Reg) { + MO.setIsKill(false); + return; + } + } + } + + // If earlier def is not in MBB, continue in predecessors. + if (!MBB->isLiveIn(Reg)) + MBB->addLiveIn(Reg); + assert(MBB->pred_size() && "Predecessor def not found!"); + for (MachineBasicBlock *Pred : MBB->predecessors()) + if (!Visited.count(Pred)) + clearKillsForDef(Pred, Pred->end(), Reg, MI, Visited, TRI); +} + +static void removeRedundantDef(MachineInstr *MI, + const TargetRegisterInfo *TRI) { + Register Reg = MI->getOperand(0).getReg(); + MBBSet Visited; + clearKillsForDef(MI->getParent(), MI->getIterator(), Reg, MI, Visited, TRI); + MI->eraseFromParent(); + ++NumRemoved; +} + +// Return true if MI is a potential candidate for reuse/removal and if so +// also the register it defines in DefedReg. A candidate is a simple +// instruction that does not touch memory, has only one register definition +// and the only reg it may use is FrameReg. Typically this is an immediate +// load or a load-address instruction. +static bool isCandidate(const MachineInstr *MI, Register &DefedReg, + Register FrameReg) { + DefedReg = 0; + bool SawStore = true; + if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() || + MI->isInlineAsm()) + return false; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + if (MO.isDef()) { + if (i == 0 && !MO.isImplicit() && !MO.isDead()) + DefedReg = MO.getReg(); + else + return false; + } + else if (MO.getReg() && MO.getReg() != FrameReg) + return false; + } + else if (!(MO.isImm() || MO.isCImm() || MO.isFPImm() || MO.isCPI() || + MO.isGlobal() || MO.isSymbol())) + return false; + } + return DefedReg != 0; +} + +bool RedundantImmLoadsCleanup:: +visitBlock(MachineBasicBlock *MBB, MBBSet &Visited, MBB2RegDefsMap &RegDefs) { + bool Changed = false; + Visited.insert(MBB); + + Reg2DefMap &MBBDefs = RegDefs[MBB]; + + // Find reusable definitions in the predecessor(s). + if (MBB->pred_size() > 0) { + MachineBasicBlock *FirstPred = *MBB->pred_begin(); + for (auto I : RegDefs[FirstPred]) { + Register Reg = I.first; + MachineInstr *DefMI = I.second; + if (!DefMI) + continue; + bool AllSame = true; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (Pred == FirstPred) + continue; + if (RegDefs[Pred][Reg] == nullptr || + !DefMI->isIdenticalTo(*RegDefs[Pred][Reg])) { + AllSame = false; + break; + } + } + if (AllSame) { + MBBDefs[Reg] = DefMI; + LLVM_DEBUG(dbgs() << "Reusable instruction from pred(s): in MBB#" + << MBB->getNumber() << ": " << *DefMI;); + } + } + } + + // Process MBB. + MachineFunction *MF = MBB->getParent(); + const TargetRegisterInfo *TRI =MF->getSubtarget().getRegisterInfo(); + Register FrameReg = TRI->getFrameRegister(*MF); + for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end();) { + MachineInstr *MI = &*(I++); + + // Clear map if the FrameReg is modified. + if (MI->modifiesRegister(FrameReg, TRI)) { + MBBDefs.clear(); + continue; + } + + Register DefedReg; + bool IsCandidate = isCandidate(MI, DefedReg, FrameReg); + + // Check for an earlier identical and reusable instruction. + if (IsCandidate && MBBDefs[DefedReg] != nullptr && + MBBDefs[DefedReg]->isIdenticalTo(*MI)) { + LLVM_DEBUG(dbgs() << "Removing redundant instruction in MBB#" + << MBB->getNumber() << ": " << *MI;); + removeRedundantDef(MI, TRI); + Changed = true; + continue; + } + + // Clear any entries in map that MI clobbers. + for (auto DefI : MBBDefs) { + Register Reg = DefI.first; + if (DefI.second && MI->modifiesRegister(Reg, TRI)) + MBBDefs[Reg] = nullptr; + } + + // Record this MI for potential later re-use. + if (IsCandidate) { + LLVM_DEBUG(dbgs() << "Found interesting instruction in MBB#" + << MBB->getNumber() << ": " << *MI;); + MBBDefs[DefedReg] = MI; + } + } + + return Changed; +} Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1501,6 +1501,9 @@ /// Add passes that optimize machine instructions after register allocation. void TargetPassConfig::addMachineLateOptimization() { + // Cleanup of redundant immediate loads. + addPass(&RedundantImmLoadsCleanupID); + // Branch folding must be run after regalloc and prolog/epilog insertion. addPass(&BranchFolderPassID); Index: llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -300,6 +300,7 @@ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). disablePass(&PrologEpilogCodeInserterID); + disablePass(&RedundantImmLoadsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&TailDuplicateID); disablePass(&StackMapLivenessID); Index: llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp =================================================================== --- llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -486,6 +486,7 @@ // them. // These functions all require the NoVRegs property. + disablePass(&RedundantImmLoadsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&PostRAMachineSinkingID); disablePass(&PostRASchedulerID); Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -177,6 +177,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; CHECK-NEXT: Redundant Imm Loads Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication Index: llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll =================================================================== --- llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll +++ llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll @@ -29,14 +29,8 @@ ; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr x9, [x9] ; CHECK-NEXT: str x8, [sp] -; CHECK-NEXT: Lloh6: -; CHECK-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE ; CHECK-NEXT: stur x9, [x29, #-8] -; CHECK-NEXT: Lloh7: -; CHECK-NEXT: ldr x8, [x8, ___stack_chk_guard@GOTPAGEOFF] ; CHECK-NEXT: ldur x9, [x29, #-8] -; CHECK-NEXT: Lloh8: -; CHECK-NEXT: ldr x8, [x8] ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: b.ne LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %entry @@ -46,7 +40,6 @@ ; CHECK-NEXT: ret ; CHECK-NEXT: LBB0_2: ; %entry ; CHECK-NEXT: bl ___stack_chk_fail -; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh6, Lloh7, Lloh8 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh1, Lloh3, Lloh5 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh0, Lloh2, Lloh4 entry: Index: llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -43,26 +43,23 @@ ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0] ; CHECK-NEXT: ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: fmov s0, #1.00000000 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: mov w1, #1 ; CHECK-NEXT: mov w2, #2 -; CHECK-NEXT: st1d { z16.d }, p0, [x9] -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w3, #3 ; CHECK-NEXT: mov w4, #4 ; CHECK-NEXT: mov w5, #5 ; CHECK-NEXT: mov w6, #6 -; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl] -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w7, #7 -; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl] ; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z16.d }, p0, [x9] +; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl] +; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl] ; CHECK-NEXT: st1d { z19.d }, p0, [x9, #3, mul vl] ; CHECK-NEXT: str x8, [sp] ; CHECK-NEXT: bl callee2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -158,50 +158,29 @@ ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:8 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:72 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:80 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:24 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:88 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:32 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:96 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:40 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:104 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:48 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:112 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:56 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:120 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:64 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:72 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:80 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:88 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:96 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:104 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:112 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:120 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:128 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s33 offset:8 -; FLATSCR-NEXT: s_mov_b32 s33, 0 +; FLATSCR-NEXT: s_nop 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s33 offset:16 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s33 offset:24 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s33 offset:32 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s33 offset:40 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s33 offset:48 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s33 offset:56 -; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s33 offset:64 ; FLATSCR-NEXT: s_movk_i32 s32, 0x50 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -499,7 +499,6 @@ ; GFX940-NEXT: scratch_store_dword v1, v0, vcc_hi sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -572,7 +571,6 @@ ; GFX940-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -946,7 +946,6 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7-NEXT: s_cbranch_execz .LBB13_2 Index: llvm/test/CodeGen/AMDGPU/cc-update.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cc-update.ll +++ llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -410,7 +410,6 @@ ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX803-NEXT: ;;#ASMSTART ; GFX803-NEXT: ;;#ASMEND -; GFX803-NEXT: s_mov_b32 s4, 0x40000 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -427,7 +426,6 @@ ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x40000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -442,8 +440,6 @@ ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill -; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: ;;#ASMSTART ; GFX1010-NEXT: ;;#ASMEND ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload Index: llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -399,18 +399,14 @@ ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:6 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:8 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: scratch_load_dword v1, off, vcc_hi offset:6 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -463,22 +459,15 @@ ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 -; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:6 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 -; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:8 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; FLATSCR_GFX10-NEXT: s_clause 0x1 ; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 -; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, vcc_lo offset:6 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] Index: llvm/test/CodeGen/AMDGPU/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -21,11 +21,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 ; GFX9-NEXT: s_endpgm ; @@ -68,11 +65,8 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 ; GFX9-PAL-NEXT: s_endpgm ; @@ -111,14 +105,8 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 ; GFX1010-PAL-NEXT: s_endpgm ; @@ -786,13 +774,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 ; GFX9-NEXT: s_endpgm ; @@ -838,13 +822,9 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 ; GFX9-PAL-NEXT: s_endpgm ; @@ -886,16 +866,9 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320 ; GFX1010-PAL-NEXT: s_endpgm ; @@ -1635,11 +1608,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-NEXT: s_endpgm ; @@ -1661,11 +1631,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX10-NEXT: s_endpgm ; @@ -1691,11 +1658,8 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi -; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 -; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-PAL-NEXT: s_endpgm ; @@ -1711,11 +1675,8 @@ ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX940-NEXT: s_endpgm ; @@ -1743,14 +1704,8 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX1010-PAL-NEXT: s_endpgm ; @@ -1777,11 +1732,8 @@ ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX1030-PAL-NEXT: s_endpgm %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -3325,7 +3277,6 @@ ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 16 @@ -3380,7 +3331,6 @@ ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 16 Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -361,6 +361,7 @@ ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Shrink Wrapping analysis ; GCN-O1-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O1-NEXT: Redundant Imm Loads Cleanup Pass ; GCN-O1-NEXT: Control Flow Optimizer ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Tail Duplication @@ -647,6 +648,7 @@ ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Shrink Wrapping analysis ; GCN-O1-OPTS-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O1-OPTS-NEXT: Redundant Imm Loads Cleanup Pass ; GCN-O1-OPTS-NEXT: Control Flow Optimizer ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Tail Duplication @@ -935,6 +937,7 @@ ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Shrink Wrapping analysis ; GCN-O2-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O2-NEXT: Redundant Imm Loads Cleanup Pass ; GCN-O2-NEXT: Control Flow Optimizer ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Tail Duplication @@ -1235,6 +1238,7 @@ ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Shrink Wrapping analysis ; GCN-O3-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O3-NEXT: Redundant Imm Loads Cleanup Pass ; GCN-O3-NEXT: Control Flow Optimizer ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Tail Duplication Index: llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -300,7 +300,6 @@ ; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 offset:16 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 ; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_mov_b32_e32 v12, 0 Index: llvm/test/CodeGen/AMDGPU/multilevel-break.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -184,7 +184,6 @@ ; GCN-NEXT: ; %bb.3: ; %LeafBlock1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: s_mov_b64 s[4:5], -1 ; GCN-NEXT: s_cbranch_vccz .LBB1_5 ; GCN-NEXT: ; %bb.4: ; %case1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 Index: llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -203,13 +203,10 @@ ; SI-NEXT: s_cbranch_vccz .LBB3_3 ; SI-NEXT: ; %bb.5: ; %convex.exit ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[8:9], -1 -; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB3_2 ; SI-NEXT: ; %bb.6: ; %if.end ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 vcc, s[4:5] ; SI-NEXT: s_cbranch_vccz .LBB3_1 ; SI-NEXT: ; %bb.7: ; %if.else @@ -278,13 +275,10 @@ ; FLAT-NEXT: s_cbranch_vccz .LBB3_3 ; FLAT-NEXT: ; %bb.5: ; %convex.exit ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[8:9], -1 -; FLAT-NEXT: s_mov_b64 s[10:11], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[2:3] ; FLAT-NEXT: s_cbranch_vccz .LBB3_2 ; FLAT-NEXT: ; %bb.6: ; %if.end ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[10:11], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[4:5] ; FLAT-NEXT: s_cbranch_vccz .LBB3_1 ; FLAT-NEXT: ; %bb.7: ; %if.else Index: llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -150,7 +150,6 @@ ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 ; MUBUF: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Reload - ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004 ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload ; Force %a to spill with no free SGPRs Index: llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -115,8 +115,7 @@ ; FLATSCR: s_movk_i32 [[SOFF1:s[0-9]+]], 0x ; GFX9-FLATSCR: s_waitcnt vmcnt(0) ; FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SOFF1]] ; 16-byte Folded Spill -; FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x -; FLATSCR: scratch_load_dwordx4 v[{{[0-9:]+}}], off, [[SOFF2]] ; 16-byte Folded Reload +; FLATSCR: scratch_load_dwordx4 v[{{[0-9:]+}}], off, [[SOFF1]] ; 16-byte Folded Reload define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) Index: llvm/test/CodeGen/ARM/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/ARM/O3-pipeline.ll +++ llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -145,6 +145,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; CHECK-NEXT: Redundant Imm Loads Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication Index: llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll =================================================================== --- llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -1652,7 +1652,6 @@ ; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-ENABLE-NEXT: @ %bb.1: @ %loop2a.preheader -; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: movs r1, #0 ; THUMB-ENABLE-NEXT: mov r2, r0 ; THUMB-ENABLE-NEXT: b LBB11_3 @@ -1679,7 +1678,6 @@ ; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-DISABLE-NEXT: @ %bb.1: @ %loop2a.preheader -; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: movs r1, #0 ; THUMB-DISABLE-NEXT: mov r2, r0 ; THUMB-DISABLE-NEXT: b LBB11_3 Index: llvm/test/CodeGen/ARM/fpclamptosat.ll =================================================================== --- llvm/test/CodeGen/ARM/fpclamptosat.ll +++ llvm/test/CodeGen/ARM/fpclamptosat.ll @@ -3764,7 +3764,6 @@ ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB48_19: @ %entry -; SOFT-NEXT: ldr r0, .LCPI48_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq .LBB48_21 @@ -4347,7 +4346,6 @@ ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB51_19: @ %entry -; SOFT-NEXT: ldr r0, .LCPI51_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq .LBB51_21 Index: llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll =================================================================== --- llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll +++ llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll @@ -22,7 +22,7 @@ ; for.body -> for.cond.backedge (100%) ; -> cond.false.i (0%) ; CHECK: bb.1.for.body: -; CHECK: successors: %bb.2(0x80000000), %bb.4(0x00000000) +; CHECK: successors: %bb.2(0x80000000), %bb.5(0x00000000) for.body: br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1 Index: llvm/test/CodeGen/ARM/jump-table-islands.ll =================================================================== --- llvm/test/CodeGen/ARM/jump-table-islands.ll +++ llvm/test/CodeGen/ARM/jump-table-islands.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=armv7-apple-ios8.0 -o - %s | FileCheck %s -%BigInt = type i5500 +%BigInt = type i8500 define %BigInt @test_moved_jumptable(i1 %tst, i32 %sw, %BigInt %l) { ; CHECK-LABEL: test_moved_jumptable: Index: llvm/test/CodeGen/ARM/reg_sequence.ll =================================================================== --- llvm/test/CodeGen/ARM/reg_sequence.ll +++ llvm/test/CodeGen/ARM/reg_sequence.ll @@ -283,7 +283,6 @@ ; CHECK-NEXT: vst1.32 {d17[1]}, [r0:32] ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: movne r0, #0 ; CHECK-NEXT: bxne lr ; CHECK-NEXT: LBB9_1: ; CHECK-NEXT: trap Index: llvm/test/CodeGen/BPF/objdump_cond_op_2.ll =================================================================== --- llvm/test/CodeGen/BPF/objdump_cond_op_2.ll +++ llvm/test/CodeGen/BPF/objdump_cond_op_2.ll @@ -14,9 +14,8 @@ ;