Index: llvm/include/llvm/CodeGen/CodeGenPassBuilder.h =================================================================== --- llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -1130,6 +1130,9 @@ if (!TM.requiresStructuredCFG()) addPass(TailDuplicatePass()); + // Cleanup of redundant (identical) address/immediate loads. + addPass(MachineLateInstrsCleanupPass()); + // Copy propagation. addPass(MachineCopyPropagationPass()); } Index: llvm/include/llvm/CodeGen/MachinePassRegistry.def =================================================================== --- llvm/include/llvm/CodeGen/MachinePassRegistry.def +++ llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -150,6 +150,7 @@ DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass, ()) DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass, ()) DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass, ()) Index: llvm/include/llvm/CodeGen/Passes.h =================================================================== --- llvm/include/llvm/CodeGen/Passes.h +++ llvm/include/llvm/CodeGen/Passes.h @@ -334,6 +334,10 @@ MachineFunctionPass *createMachineCopyPropagationPass(bool UseCopyInstr); + /// MachineLateInstrsCleanup - This pass removes redundant identical + /// instructions after register allocation and rematerialization. + extern char &MachineLateInstrsCleanupID; + /// PeepholeOptimizer - This pass performs peephole optimizations - /// like extension and comparison eliminations. extern char &PeepholeOptimizerID; Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -276,6 +276,7 @@ void initializeMachineDominatorTreePass(PassRegistry&); void initializeMachineFunctionPrinterPassPass(PassRegistry&); void initializeMachineFunctionSplitterPass(PassRegistry &); +void initializeMachineLateInstrsCleanupPass(PassRegistry&); void initializeMachineLICMPass(PassRegistry&); void initializeMachineLoopInfoPass(PassRegistry&); void initializeMachineModuleInfoWrapperPassPass(PassRegistry &); Index: llvm/lib/CodeGen/CMakeLists.txt =================================================================== --- llvm/lib/CodeGen/CMakeLists.txt +++ llvm/lib/CodeGen/CMakeLists.txt @@ -118,6 +118,7 @@ MachineFunctionSplitter.cpp MachineInstrBundle.cpp MachineInstr.cpp + MachineLateInstrsCleanup.cpp MachineLICM.cpp MachineLoopInfo.cpp MachineLoopUtils.cpp Index: llvm/lib/CodeGen/CodeGen.cpp =================================================================== --- llvm/lib/CodeGen/CodeGen.cpp +++ llvm/lib/CodeGen/CodeGen.cpp @@ -77,6 +77,7 @@ initializeMachineCycleInfoWrapperPassPass(Registry); initializeMachineDominatorTreePass(Registry); initializeMachineFunctionPrinterPassPass(Registry); + initializeMachineLateInstrsCleanupPass(Registry); initializeMachineLICMPass(Registry); initializeMachineLoopInfoPass(Registry); initializeMachineModuleInfoWrapperPassPass(Registry); Index: llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp =================================================================== --- /dev/null +++ llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp @@ -0,0 +1,239 @@ +//==--- MachineLateInstrsCleanup.cpp - Late Instructions Cleanup Pass -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This simple pass removes any identical and redundant immediate or address +// loads to the same register. The immediate loads removed can originally be +// the result of rematerialization, while the addresses are redundant frame +// addressing anchor points created during Frame Indices elimination. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-latecleanup" + +STATISTIC(NumRemoved, "Number of redundant instructions removed."); + +namespace { + +class MachineLateInstrsCleanup : public MachineFunctionPass { + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + + // Data structures to map regs to their definitions per MBB. + using Reg2DefMap = std::map; + std::vector RegDefs; + + // Walk through the instructions in MBB and remove any redundant + // instructions. + bool processBlock(MachineBasicBlock *MBB); + +public: + static char ID; // Pass identification, replacement for typeid + + MachineLateInstrsCleanup() : MachineFunctionPass(ID) { + initializeMachineLateInstrsCleanupPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } +}; + +} // end anonymous namespace + +char MachineLateInstrsCleanup::ID = 0; + +char &llvm::MachineLateInstrsCleanupID = MachineLateInstrsCleanup::ID; + +INITIALIZE_PASS(MachineLateInstrsCleanup, DEBUG_TYPE, + "Machine Late Instructions Cleanup Pass", false, false) + +bool MachineLateInstrsCleanup::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + bool Changed = false; + + TRI = MF.getSubtarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + + RegDefs.clear(); + RegDefs.resize(MF.getNumBlockIDs()); + + // Visit all MBBs in an order that maximises the reuse from predecessors. + ReversePostOrderTraversal RPOT(&MF); + for (MachineBasicBlock *MBB : RPOT) + Changed |= processBlock(MBB); + + return Changed; +} + +// Clear any previous kill flag on Reg found before I in MBB. Walk backwards +// in MBB and if needed continue in predecessors until a use/def of Reg is +// encountered. This seems to be faster in practice than tracking kill flags +// in a map. +static void clearKillsForDef(Register Reg, MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + BitVector &VisitedPreds, + const TargetRegisterInfo *TRI) { + VisitedPreds.set(MBB->getNumber()); + while (I != MBB->begin()) { + I--; + bool Found = false; + for (auto &MO : I->operands()) + if (MO.isReg() && TRI->regsOverlap(MO.getReg(), Reg)) { + if (MO.isDef()) + return; + if (MO.readsReg()) { + MO.setIsKill(false); + Found = true; // Keep going for an implicit kill of the super-reg. + } + } + if (Found) + return; + } + + // If an earlier def is not in MBB, continue in predecessors. + if (!MBB->isLiveIn(Reg)) + MBB->addLiveIn(Reg); + assert(!MBB->pred_empty() && "Predecessor def not found!"); + for (MachineBasicBlock *Pred : MBB->predecessors()) + if (!VisitedPreds.test(Pred->getNumber())) + clearKillsForDef(Reg, Pred, Pred->end(), VisitedPreds, TRI); +} + +static void removeRedundantDef(MachineInstr *MI, + const TargetRegisterInfo *TRI) { + Register Reg = MI->getOperand(0).getReg(); + BitVector VisitedPreds(MI->getMF()->getNumBlockIDs()); + clearKillsForDef(Reg, MI->getParent(), MI->getIterator(), VisitedPreds, TRI); + MI->eraseFromParent(); + ++NumRemoved; +} + +// Return true if MI is a potential candidate for reuse/removal and if so +// also the register it defines in DefedReg. A candidate is a simple +// instruction that does not touch memory, has only one register definition +// and the only reg it may use is FrameReg. Typically this is an immediate +// load or a load-address instruction. +static bool isCandidate(const MachineInstr *MI, Register &DefedReg, + Register FrameReg) { + DefedReg = MCRegister::NoRegister; + bool SawStore = true; + if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() || + MI->isInlineAsm()) + return false; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + if (MO.isDef()) { + if (i == 0 && !MO.isImplicit() && !MO.isDead()) + DefedReg = MO.getReg(); + else + return false; + } else if (MO.getReg() && MO.getReg() != FrameReg) + return false; + } else if (!(MO.isImm() || MO.isCImm() || MO.isFPImm() || MO.isCPI() || + MO.isGlobal() || MO.isSymbol())) + return false; + } + return DefedReg.isValid(); +} + +bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) { + bool Changed = false; + + Reg2DefMap &MBBDefs = RegDefs[MBB->getNumber()]; + + // Find reusable definitions in the predecessor(s). + if (!MBB->pred_empty()) { + MachineBasicBlock *FirstPred = *MBB->pred_begin(); + for (auto [Reg, DefMI] : RegDefs[FirstPred->getNumber()]) + if (llvm::all_of(drop_begin(MBB->predecessors()), + [&](const MachineBasicBlock *Pred) { + auto PredDefI = RegDefs[Pred->getNumber()].find(Reg); + return PredDefI != RegDefs[Pred->getNumber()].end() && + DefMI->isIdenticalTo(*PredDefI->second); + })) { + MBBDefs[Reg] = DefMI; + LLVM_DEBUG(dbgs() << "Reusable instruction from pred(s): in " + << printMBBReference(*MBB) << ": " << *DefMI;); + } + } + + // Process MBB. + MachineFunction *MF = MBB->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + Register FrameReg = TRI->getFrameRegister(*MF); + for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { + // If FrameReg is modified, no previous load-address instructions are valid. + if (MI.modifiesRegister(FrameReg, TRI)) { + MBBDefs.clear(); + continue; + } + + Register DefedReg; + bool IsCandidate = isCandidate(&MI, DefedReg, FrameReg); + + // Check for an earlier identical and reusable instruction. + if (IsCandidate) { + auto DefI = MBBDefs.find(DefedReg); + if (DefI != MBBDefs.end() && MI.isIdenticalTo(*DefI->second)) { + LLVM_DEBUG(dbgs() << "Removing redundant instruction in " + << printMBBReference(*MBB) << ": " << MI;); + removeRedundantDef(&MI, TRI); + Changed = true; + continue; + } + } + + // Clear any entries in map that MI clobbers. + for (auto DefI = MBBDefs.begin(); DefI != MBBDefs.end();) { + Register Reg = DefI->first; + if (MI.modifiesRegister(Reg, TRI)) + DefI = MBBDefs.erase(DefI); + else + ++DefI; + } + + // Record this MI for potential later reuse. + if (IsCandidate) { + LLVM_DEBUG(dbgs() << "Found interesting instruction in " + << printMBBReference(*MBB) << ": " << MI;); + MBBDefs[DefedReg] = &MI; + } + } + + return Changed; +} Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1520,6 +1520,9 @@ /// Add passes that optimize machine instructions after register allocation. void TargetPassConfig::addMachineLateOptimization() { + // Cleanup of redundant immediate/address loads. + addPass(&MachineLateInstrsCleanupID); + // Branch folding must be run after regalloc and prolog/epilog insertion. addPass(&BranchFolderPassID); Index: llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -300,6 +300,7 @@ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). disablePass(&PrologEpilogCodeInserterID); + disablePass(&MachineLateInstrsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&TailDuplicateID); disablePass(&StackMapLivenessID); Index: llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp =================================================================== --- llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -500,6 +500,7 @@ // them. // These functions all require the NoVRegs property. + disablePass(&MachineLateInstrsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&PostRAMachineSinkingID); disablePass(&PostRASchedulerID); Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -183,6 +183,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; CHECK-NEXT: Machine Late Instructions Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication Index: llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll =================================================================== --- llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll +++ llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll @@ -29,14 +29,8 @@ ; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr x9, [x9] ; CHECK-NEXT: str x8, [sp] -; CHECK-NEXT: Lloh6: -; CHECK-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE ; CHECK-NEXT: stur x9, [x29, #-8] -; CHECK-NEXT: Lloh7: -; CHECK-NEXT: ldr x8, [x8, ___stack_chk_guard@GOTPAGEOFF] ; CHECK-NEXT: ldur x9, [x29, #-8] -; CHECK-NEXT: Lloh8: -; CHECK-NEXT: ldr x8, [x8] ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: b.ne LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %entry @@ -46,7 +40,6 @@ ; CHECK-NEXT: ret ; CHECK-NEXT: LBB0_2: ; %entry ; CHECK-NEXT: bl ___stack_chk_fail -; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh6, Lloh7, Lloh8 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh1, Lloh3, Lloh5 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh0, Lloh2, Lloh4 entry: Index: llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -59,26 +59,23 @@ ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: fmov s0, #1.00000000 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: mov w1, #1 ; CHECK-NEXT: mov w2, #2 -; CHECK-NEXT: st1d { z16.d }, p0, [x9] -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w3, #3 ; CHECK-NEXT: mov w4, #4 ; CHECK-NEXT: mov w5, #5 ; CHECK-NEXT: mov w6, #6 -; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl] -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w7, #7 -; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl] ; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z16.d }, p0, [x9] +; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl] +; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl] ; CHECK-NEXT: st1d { z19.d }, p0, [x9, #3, mul vl] ; CHECK-NEXT: str x8, [sp] ; CHECK-NEXT: bl callee2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -157,8 +157,6 @@ ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16 ; FLATSCR-NEXT: s_mov_b32 s11, 0 ; FLATSCR-NEXT: s_mov_b32 s10, 0 ; FLATSCR-NEXT: s_mov_b32 s9, 0 @@ -171,8 +169,8 @@ ; FLATSCR-NEXT: s_mov_b32 s4, 0 ; FLATSCR-NEXT: s_mov_b32 s3, 0 ; FLATSCR-NEXT: s_mov_b32 s2, 0 -; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0 -; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s11 offset:24 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s10 offset:32 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s9 offset:40 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -1354,7 +1354,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7-NEXT: s_cbranch_execz .LBB13_2 Index: llvm/test/CodeGen/AMDGPU/cc-update.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cc-update.ll +++ llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -537,7 +537,6 @@ ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX803-NEXT: ;;#ASMSTART ; GFX803-NEXT: ;;#ASMEND -; GFX803-NEXT: s_mov_b32 s4, 0x40000 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -554,7 +553,6 @@ ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x40000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -569,8 +567,6 @@ ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill -; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: ;;#ASMSTART ; GFX1010-NEXT: ;;#ASMEND ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload @@ -585,7 +581,6 @@ ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: s_movk_i32 s0, 0x1000 ; GFX1100-NEXT: scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill -; GFX1100-NEXT: s_movk_i32 s0, 0x1000 ; GFX1100-NEXT: ;;#ASMSTART ; GFX1100-NEXT: ;;#ASMEND ; GFX1100-NEXT: scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload Index: llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -76,12 +76,10 @@ ; CHECK-NEXT: ; %bb.10: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[20:21], -1 ; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] ; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 ; CHECK-NEXT: ; implicit-def: $sgpr16_sgpr17 ; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] Index: llvm/test/CodeGen/AMDGPU/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -22,7 +22,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 vcc_lo, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:52 @@ -88,7 +87,6 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:52 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20 @@ -131,7 +129,6 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: s_mov_b32 s2, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:52 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:36 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 @@ -980,7 +977,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 vcc_lo, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:260 @@ -1053,7 +1049,6 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:260 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292 @@ -1100,7 +1095,6 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: s_mov_b32 s2, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:260 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:276 Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -372,6 +372,7 @@ ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Shrink Wrapping analysis ; GCN-O1-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O1-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O1-NEXT: Control Flow Optimizer ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Tail Duplication @@ -667,6 +668,7 @@ ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Shrink Wrapping analysis ; GCN-O1-OPTS-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O1-OPTS-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O1-OPTS-NEXT: Control Flow Optimizer ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Tail Duplication @@ -964,6 +966,7 @@ ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Shrink Wrapping analysis ; GCN-O2-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O2-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O2-NEXT: Control Flow Optimizer ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Tail Duplication @@ -1274,6 +1277,7 @@ ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Shrink Wrapping analysis ; GCN-O3-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; GCN-O3-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O3-NEXT: Control Flow Optimizer ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Tail Duplication Index: llvm/test/CodeGen/AMDGPU/multilevel-break.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -188,7 +188,6 @@ ; GCN-NEXT: ; %bb.3: ; %LeafBlock1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_cmp_eq_u32 s8, 1 -; GCN-NEXT: s_mov_b64 s[4:5], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-NEXT: ; %bb.4: ; %case1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 Index: llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -187,8 +187,6 @@ ; SI-NEXT: s_branch .LBB3_3 ; SI-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: s_mov_b64 s[12:13], -1 -; SI-NEXT: s_mov_b64 s[14:15], -1 ; SI-NEXT: .LBB3_2: ; %Flow ; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; SI-NEXT: s_and_b64 vcc, exec, s[14:15] @@ -206,7 +204,6 @@ ; SI-NEXT: s_cbranch_vccz .LBB3_1 ; SI-NEXT: ; %bb.5: ; %if.end ; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: s_mov_b64 s[14:15], -1 ; SI-NEXT: s_mov_b64 vcc, s[6:7] ; SI-NEXT: s_cbranch_vccz .LBB3_7 ; SI-NEXT: ; %bb.6: ; %if.else @@ -263,8 +260,6 @@ ; FLAT-NEXT: s_branch .LBB3_3 ; FLAT-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 ; FLAT-NEXT: s_mov_b64 s[8:9], 0 -; FLAT-NEXT: s_mov_b64 s[12:13], -1 -; FLAT-NEXT: s_mov_b64 s[14:15], -1 ; FLAT-NEXT: .LBB3_2: ; %Flow ; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; FLAT-NEXT: s_and_b64 vcc, exec, s[14:15] @@ -282,7 +277,6 @@ ; FLAT-NEXT: s_cbranch_vccz .LBB3_1 ; FLAT-NEXT: ; %bb.5: ; %if.end ; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: s_mov_b64 s[14:15], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[6:7] ; FLAT-NEXT: s_cbranch_vccz .LBB3_7 ; FLAT-NEXT: ; %bb.6: ; %if.else Index: llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -60,7 +60,6 @@ ; CHECK-NEXT: s_cmp_lg_u32 s10, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_14 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: .LBB0_4: ; %Flow3 ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec @@ -103,7 +102,6 @@ ; CHECK-NEXT: s_branch .LBB0_10 ; CHECK-NEXT: .LBB0_14: ; %cond.false.i8 ; CHECK-NEXT: s_mov_b64 s[2:3], -1 -; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 entry: Index: llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -34,7 +34,6 @@ ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_movk_i32 s0, 0xffc ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 @@ -71,7 +70,6 @@ ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: s_mov_b32 s4, 0x40000 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -90,7 +88,6 @@ ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 @@ -237,7 +234,6 @@ ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_movk_i32 s8, 0x1004 ; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART @@ -320,7 +316,6 @@ ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART @@ -367,7 +362,6 @@ ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_nop 0 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload @@ -391,7 +385,6 @@ ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xffc ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART Index: llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10551,7 +10551,6 @@ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_mov_b32 s2, 0x84800 ; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10796,7 +10795,7 @@ ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 +; GFX9-FLATSCR-NEXT: s_nop 0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload @@ -11032,7 +11031,6 @@ ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v60 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v65 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v66 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v67 Index: llvm/test/CodeGen/ARM/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/ARM/O3-pipeline.ll +++ llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -148,6 +148,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; CHECK-NEXT: Machine Late Instructions Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication Index: llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll =================================================================== --- llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -1652,7 +1652,6 @@ ; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-ENABLE-NEXT: @ %bb.1: @ %loop2a.preheader -; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: movs r1, #0 ; THUMB-ENABLE-NEXT: mov r2, r0 ; THUMB-ENABLE-NEXT: b LBB11_3 @@ -1679,7 +1678,6 @@ ; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-DISABLE-NEXT: @ %bb.1: @ %loop2a.preheader -; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: movs r1, #0 ; THUMB-DISABLE-NEXT: mov r2, r0 ; THUMB-DISABLE-NEXT: b LBB11_3 Index: llvm/test/CodeGen/ARM/fpclamptosat.ll =================================================================== --- llvm/test/CodeGen/ARM/fpclamptosat.ll +++ llvm/test/CodeGen/ARM/fpclamptosat.ll @@ -3764,7 +3764,6 @@ ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB48_19: @ %entry -; SOFT-NEXT: ldr r0, .LCPI48_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq .LBB48_21 @@ -4347,7 +4346,6 @@ ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB51_19: @ %entry -; SOFT-NEXT: ldr r0, .LCPI51_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq .LBB51_21 Index: llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll =================================================================== --- llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll +++ llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll @@ -22,7 +22,7 @@ ; for.body -> for.cond.backedge (100%) ; -> cond.false.i (0%) ; CHECK: bb.1.for.body: -; CHECK: successors: %bb.2(0x80000000), %bb.4(0x00000000) +; CHECK: successors: %bb.2(0x80000000), %bb.5(0x00000000) for.body: br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1 Index: llvm/test/CodeGen/ARM/jump-table-islands.ll =================================================================== --- llvm/test/CodeGen/ARM/jump-table-islands.ll +++ llvm/test/CodeGen/ARM/jump-table-islands.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=armv7-apple-ios8.0 -o - %s | FileCheck %s -%BigInt = type i5500 +%BigInt = type i8500 define %BigInt @test_moved_jumptable(i1 %tst, i32 %sw, %BigInt %l) { ; CHECK-LABEL: test_moved_jumptable: Index: llvm/test/CodeGen/ARM/reg_sequence.ll =================================================================== --- llvm/test/CodeGen/ARM/reg_sequence.ll +++ llvm/test/CodeGen/ARM/reg_sequence.ll @@ -283,7 +283,6 @@ ; CHECK-NEXT: vst1.32 {d17[1]}, [r0:32] ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: movne r0, #0 ; CHECK-NEXT: bxne lr ; CHECK-NEXT: LBB9_1: ; CHECK-NEXT: trap Index: llvm/test/CodeGen/BPF/objdump_cond_op_2.ll =================================================================== --- llvm/test/CodeGen/BPF/objdump_cond_op_2.ll +++ llvm/test/CodeGen/BPF/objdump_cond_op_2.ll @@ -14,9 +14,8 @@ ;