Index: lib/Target/ARM/ARMLoadStoreOptimizer.cpp =================================================================== --- lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -31,11 +31,13 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/Support/Allocator.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -65,12 +67,18 @@ static char ID; ARMLoadStoreOpt() : MachineFunctionPass(ID) {} + const MachineFunction *MF; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; + const MachineRegisterInfo *MRI; const ARMSubtarget *STI; const TargetLowering *TL; ARMFunctionInfo *AFI; - RegScavenger *RS; + LivePhysRegs LiveRegs; + RegisterClassInfo RegClassInfo; + MachineBasicBlock::const_iterator LiveRegPos; + bool LiveRegsValid; + bool RegClassInfoValid; bool isThumb1, isThumb2; bool runOnMachineFunction(MachineFunction &Fn) override; @@ -80,64 +88,50 @@ } private: + /// A set of MachineInstrs with same base address sorted by offset. struct MemOpQueueEntry { - int Offset; - unsigned Reg; - bool isKill; - unsigned Position; - MachineBasicBlock::iterator MBBI; - bool Merged; - MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, - MachineBasicBlock::iterator i) - : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} + MachineInstr *MI; + int Offset; ///< Load/Store offset. + unsigned Position; ///< Position as counted from end of basic block. + MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position) + : MI(MI), Offset(Offset), Position(Position) {} }; typedef SmallVector MemOpQueue; - typedef MemOpQueue::iterator MemOpQueueIter; - void findUsesOfImpDef(SmallVectorImpl &UsesOfImpDefs, - const MemOpQueue &MemOps, unsigned DefReg, - unsigned RangeBegin, unsigned RangeEnd); + /// A set of MachineInstrs that fulfill (nearly all) conditions to get + /// merged into a LDM/STM. + struct MergeCandidate { + /// List of instructions ordered by load/store offset. + SmallVector Instrs; + /// Index in Instrs of the instruction being latest in the schedule. + unsigned LatestMIIdx; + /// Index in Instrs of the instruction being earliest in the schedule. + unsigned EarliestMIIdx; + /// Index into the basic block where the merged instruction will be + /// inserted. (See MemOpQueueEntry.Position) + unsigned InsertPos; + }; + BumpPtrAllocator Allocator; + SmallVector Candidates; + + void moveLiveRegsBefore(const MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator Before); + unsigned findFreeReg(const TargetRegisterClass &RegClass); void UpdateBaseRegUses(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc dl, unsigned Base, unsigned WordOffset, + DebugLoc DL, unsigned Base, unsigned WordOffset, ARMCC::CondCodes Pred, unsigned PredReg); - bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - int Offset, unsigned Base, bool BaseKill, unsigned Opcode, - ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, - DebugLoc dl, - ArrayRef > Regs, - ArrayRef ImpDefs); - void MergeOpsUpdate(MachineBasicBlock &MBB, - MemOpQueue &MemOps, - unsigned memOpsBegin, - unsigned memOpsEnd, - unsigned insertAfter, - int Offset, - unsigned Base, - bool BaseKill, - unsigned Opcode, - ARMCC::CondCodes Pred, - unsigned PredReg, - unsigned Scratch, - DebugLoc dl, - SmallVectorImpl &Merges); - void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base, - unsigned Opcode, unsigned Size, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned Scratch, MemOpQueue &MemOps, - SmallVectorImpl &Merges); - void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps); + MachineInstr *MergeOps(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, + unsigned Base, bool BaseKill, unsigned Opcode, + ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL, + ArrayRef> Regs); + void FormCandidates(const MemOpQueue &MemOps); + MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand); bool FixInvalidRegPairOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); - bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const TargetInstrInfo *TII, - bool &Advance, - MachineBasicBlock::iterator &I); - bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - bool &Advance, - MachineBasicBlock::iterator &I); + bool MergeBaseUpdateLoadStore(MachineInstr *MI); + bool MergeBaseUpdateLSMultiple(MachineInstr *MI); bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); }; @@ -185,6 +179,14 @@ return Offset; } +static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) { + return MI.getOperand(1); +} + +static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) { + return MI.getOperand(0); +} + static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) { switch (Opcode) { default: llvm_unreachable("Unhandled opcode!"); @@ -348,6 +350,10 @@ return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc); } +static bool isLoad(unsigned Opc) { + return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD; +} + static unsigned getImmScale(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Unhandled opcode!"); @@ -365,12 +371,55 @@ } } +static unsigned getLSMultipleTransferSize(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return 0; + case ARM::LDRi12: + case ARM::STRi12: + case ARM::tLDRi: + case ARM::tSTRi: + case ARM::tLDRspi: + case ARM::tSTRspi: + case ARM::t2LDRi8: + case ARM::t2LDRi12: + case ARM::t2STRi8: + case ARM::t2STRi12: + case ARM::VLDRS: + case ARM::VSTRS: + return 4; + case ARM::VLDRD: + case ARM::VSTRD: + return 8; + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA_UPD: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::VLDMSIA: + case ARM::VSTMSIA: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4; + case ARM::VLDMDIA: + case ARM::VSTMDIA: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8; + } +} + /// Update future uses of the base register with the offset introduced /// due to writeback. This function only works on Thumb1. void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc dl, unsigned Base, + DebugLoc DL, unsigned Base, unsigned WordOffset, ARMCC::CondCodes Pred, unsigned PredReg) { assert(isThumb1 && "Can only update base register uses for Thumb1!"); @@ -398,7 +447,7 @@ Offset = MO.getImm() - WordOffset * getImmScale(Opc); // If storing the base register, it needs to be reset first. - unsigned InstrSrcReg = MBBI->getOperand(0).getReg(); + unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg(); if (Offset >= 0 && !(IsStore && InstrSrcReg == Base)) MO.setImm(Offset); @@ -439,7 +488,7 @@ if (InsertSub) { // An instruction above couldn't be updated, so insert a sub. - AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true) + AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true) .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg); return; } @@ -457,31 +506,67 @@ // See PR21029. if (MBBI != MBB.end()) --MBBI; AddDefaultT1CC( - BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true) + BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true) .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg); } } +/// Return the first register of class \p RegClass that is not in \p Regs. +unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) { + if (!RegClassInfoValid) { + RegClassInfo.runOnMachineFunction(*MF); + RegClassInfoValid = true; + } + + for (unsigned Reg : RegClassInfo.getOrder(&RegClass)) + if (!LiveRegs.contains(Reg)) + return Reg; + return 0; +} + +/// Compute live registers just before instruction \p Before (in normal schedule +/// direction). Computes backwards so multiple queries in the same block must +/// come in reverse order. +void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator Before) { + // Initialize if we never queried in this block. + if (!LiveRegsValid) { + LiveRegs.init(TRI); + LiveRegs.addLiveOuts(&MBB, true); + LiveRegPos = MBB.end(); + LiveRegsValid = true; + } + // Move backward just before the "Before" position. + while (LiveRegPos != Before) { + --LiveRegPos; + LiveRegs.stepBackward(*LiveRegPos); + } +} + +static bool ContainsReg(const ArrayRef> &Regs, + unsigned Reg) { + for (const std::pair &R : Regs) + if (R.first == Reg) + return true; + return false; +} + /// Create and insert a LDM or STM with Base as base register and registers in /// Regs as the register operands that would be loaded / stored. It returns /// true if the transformation is done. -bool +MachineInstr * ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - int Offset, unsigned Base, bool BaseKill, - unsigned Opcode, ARMCC::CondCodes Pred, - unsigned PredReg, unsigned Scratch, DebugLoc dl, - ArrayRef > Regs, - ArrayRef ImpDefs) { - // Only a single register to load / store. Don't bother. + MachineBasicBlock::iterator InsertBefore, int Offset, + unsigned Base, bool BaseKill, unsigned Opcode, + ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL, + ArrayRef> Regs) { unsigned NumRegs = Regs.size(); - if (NumRegs <= 1) - return false; + assert(NumRegs > 1); // For Thumb1 targets, it might be necessary to clobber the CPSR to merge. // Compute liveness information for that register to make the decision. bool SafeToClobberCPSR = !isThumb1 || - (MBB.computeRegisterLiveness(TRI, ARM::CPSR, std::prev(MBBI), 15) == + (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) == MachineBasicBlock::LQR_Dead); bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback. @@ -489,17 +574,14 @@ // Exception: If the base register is in the input reglist, Thumb1 LDM is // non-writeback. // It's also not possible to merge an STR of the base register in Thumb1. - if (isThumb1) - for (const std::pair &R : Regs) - if (Base == R.first) { - assert(Base != ARM::SP && "Thumb1 does not allow SP in register list"); - if (Opcode == ARM::tLDRi) { - Writeback = false; - break; - } else if (Opcode == ARM::tSTRi) { - return false; - } - } + if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) { + assert(Base != ARM::SP && "Thumb1 does not allow SP in register list"); + if (Opcode == ARM::tLDRi) { + Writeback = false; + } else if (Opcode == ARM::tSTRi) { + return nullptr; + } + } ARM_AM::AMSubMode Mode = ARM_AM::ia; // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA. @@ -516,18 +598,18 @@ } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) { // Check if this is a supported opcode before inserting instructions to // calculate a new base register. - if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false; + if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr; // If starting offset isn't zero, insert a MI to materialize a new base. // But only do so if it is cost effective, i.e. merging more than two // loads / stores. if (NumRegs <= 2) - return false; + return nullptr; // On Thumb1, it's not worth materializing a new base register without // clobbering the CPSR (i.e. not using ADDS/SUBS). if (!SafeToClobberCPSR) - return false; + return nullptr; unsigned NewBase; if (isi32Load(Opcode)) { @@ -535,10 +617,17 @@ // use as the new base. NewBase = Regs[NumRegs-1].first; } else { - // Use the scratch register to use as a new base. - NewBase = Scratch; + // Find a free register that we can use as scratch register. + moveLiveRegsBefore(MBB, InsertBefore); + // The merged instruction does not exist yet but will use several Regs if + // it is a Store. + if (!isLoad(Opcode)) + for (const std::pair &R : Regs) + LiveRegs.addReg(R.first); + + NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass); if (NewBase == 0) - return false; + return nullptr; } int BaseOpc = @@ -557,7 +646,12 @@ if (!TL->isLegalAddImmediate(Offset)) // FIXME: Try add with register operand? - return false; // Probably not worth it then. + return nullptr; // Probably not worth it then. + + // We can only append a kill flag to the add/sub input if the value is not + // used in the register list of the stm as well. + bool KillOldBase = BaseKill && + (!isi32Store(Opcode) || !ContainsReg(Regs, Base)); if (isThumb1) { // Thumb1: depending on immediate size, use either @@ -572,43 +666,43 @@ !STI->hasV6Ops()) { // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr if (Pred != ARMCC::AL) - return false; - BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVSr), NewBase) - .addReg(Base, getKillRegState(BaseKill)); + return nullptr; + BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase) + .addReg(Base, getKillRegState(KillOldBase)); } else - BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase) - .addReg(Base, getKillRegState(BaseKill)) + BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase) + .addReg(Base, getKillRegState(KillOldBase)) .addImm(Pred).addReg(PredReg); - // Set up BaseKill and Base correctly to insert the ADDS/SUBS below. + // The following ADDS/SUBS becomes an update. Base = NewBase; - BaseKill = false; + KillOldBase = true; } if (BaseOpc == ARM::tADDrSPi) { assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4"); - BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase) - .addReg(Base, getKillRegState(BaseKill)).addImm(Offset/4) + BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase) + .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4) .addImm(Pred).addReg(PredReg); } else - AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true) - .addReg(Base, getKillRegState(BaseKill)).addImm(Offset) + AddDefaultT1CC( + BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true) + .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset) .addImm(Pred).addReg(PredReg); } else { - BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase) - .addReg(Base, getKillRegState(BaseKill)).addImm(Offset) + BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase) + .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset) .addImm(Pred).addReg(PredReg).addReg(0); } Base = NewBase; BaseKill = true; // New base is always killed straight away. } - bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS || - Opcode == ARM::VLDRD); + bool isDef = isLoad(Opcode); // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with // base register writeback. Opcode = getLoadStoreMultipleOpcode(Opcode, Mode); - if (!Opcode) return false; + if (!Opcode) return nullptr; // Check if a Thumb1 LDM/STM merge is safe. This is the case if: // - There is no writeback (LDM of base register), @@ -619,7 +713,7 @@ // It's safe to return here since the code to materialize a new base register // above is also conditional on SafeToClobberCPSR. if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill) - return false; + return nullptr; MachineInstrBuilder MIB; @@ -628,7 +722,7 @@ // Update tLDMIA with writeback if necessary. Opcode = ARM::tLDMIA_UPD; - MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)); + MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode)); // Thumb1: we might need to set base writeback when building the MI. MIB.addReg(Base, getDefRegState(true)) @@ -637,214 +731,153 @@ // The base isn't dead after a merged instruction with writeback. // Insert a sub instruction after the newly formed instruction to reset. if (!BaseKill) - UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg); + UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg); } else { // No writeback, simply build the MachineInstr. - MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)); + MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode)); MIB.addReg(Base, getKillRegState(BaseKill)); } MIB.addImm(Pred).addReg(PredReg); for (const std::pair &R : Regs) - MIB = MIB.addReg(R.first, getDefRegState(isDef) - | getKillRegState(R.second)); - - // Add implicit defs for super-registers. - for (unsigned ImpDef : ImpDefs) - MIB.addReg(ImpDef, RegState::ImplicitDefine); + MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second)); - return true; -} - -/// Find all instructions using a given imp-def within a range. -/// -/// We are trying to combine a range of instructions, one of which (located at -/// position RangeBegin) implicitly defines a register. The final LDM/STM will -/// be placed at RangeEnd, and so any uses of this definition between RangeStart -/// and RangeEnd must be modified to use an undefined value. -/// -/// The live range continues until we find a second definition or one of the -/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so -/// we must consider all uses and decide which are relevant in a second pass. -void ARMLoadStoreOpt::findUsesOfImpDef( - SmallVectorImpl &UsesOfImpDefs, const MemOpQueue &MemOps, - unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) { - std::map Uses; - unsigned LastLivePos = RangeEnd; - - // First we find all uses of this register with Position between RangeBegin - // and RangeEnd, any or all of these could be uses of a definition at - // RangeBegin. We also record the latest position a definition at RangeBegin - // would be considered live. - for (unsigned i = 0; i < MemOps.size(); ++i) { - MachineInstr &MI = *MemOps[i].MBBI; - unsigned MIPosition = MemOps[i].Position; - if (MIPosition <= RangeBegin || MIPosition > RangeEnd) - continue; - - // If this instruction defines the register, then any later use will be of - // that definition rather than ours. - if (MI.definesRegister(DefReg)) - LastLivePos = std::min(LastLivePos, MIPosition); - - MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg); - if (!UseOp) - continue; - - // If this instruction kills the register then (assuming liveness is - // correct when we start) we don't need to think about anything after here. - if (UseOp->isKill()) - LastLivePos = std::min(LastLivePos, MIPosition); - - Uses[MIPosition] = UseOp; - } - - // Now we traverse the list of all uses, and append the ones that actually use - // our definition to the requested list. - for (std::map::iterator I = Uses.begin(), - E = Uses.end(); - I != E; ++I) { - // List is sorted by position so once we've found one out of range there - // will be no more to consider. - if (I->first > LastLivePos) - break; - UsesOfImpDefs.push_back(I->second); - } + return MIB.getInstr(); } /// Call MergeOps and update MemOps and merges accordingly on success. -void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, - MemOpQueue &memOps, - unsigned memOpsBegin, unsigned memOpsEnd, - unsigned insertAfter, int Offset, - unsigned Base, bool BaseKill, - unsigned Opcode, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned Scratch, - DebugLoc dl, - SmallVectorImpl &Merges) { - // First calculate which of the registers should be killed by the merged - // instruction. - const unsigned insertPos = memOps[insertAfter].Position; - SmallSet KilledRegs; - DenseMap Killer; - for (unsigned i = 0, e = memOps.size(); i != e; ++i) { - if (i == memOpsBegin) { - i = memOpsEnd; - if (i == e) - break; - } - if (memOps[i].Position < insertPos && memOps[i].isKill) { - unsigned Reg = memOps[i].Reg; +MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { + const MachineInstr *First = Cand.Instrs.front(); + unsigned Opcode = First->getOpcode(); + bool IsLoad = isLoad(Opcode); + SmallVector, 8> Regs; + SmallVector ImpDefs; + DenseSet KilledRegs; + // Determine list of registers and list of implicit super-register defs. + for (const MachineInstr *MI : Cand.Instrs) { + const MachineOperand &MO = getLoadStoreRegOp(*MI); + unsigned Reg = MO.getReg(); + bool IsKill = MO.isKill(); + if (IsKill) KilledRegs.insert(Reg); - Killer[Reg] = i; + Regs.push_back(std::make_pair(Reg, IsKill)); + + if (IsLoad) { + // Collect any implicit defs of super-registers, after merging we can't + // be sure anymore that we properly preserved these live ranges and must + // removed these implicit operands. + for (const MachineOperand &MO : MI->implicit_operands()) { + if (!MO.isReg() || !MO.isDef() || MO.isDead()) + continue; + assert(MO.isImplicit()); + unsigned DefReg = MO.getReg(); + + if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end()) + continue; + // We can ignore cases where the super-reg is read and written. + if (MI->readsRegister(DefReg)) + continue; + ImpDefs.push_back(DefReg); + } } } +#if 0 + // TODO FIXME: Look at this for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { MachineOperand &TransferOp = memOps[i].MBBI->getOperand(0); if (TransferOp.isUse() && TransferOp.getReg() == Base) BaseKill = false; } - - SmallVector, 8> Regs; - SmallVector ImpDefs; - SmallVector UsesOfImpDefs; - for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { - unsigned Reg = memOps[i].Reg; - // If we are inserting the merged operation after an operation that - // uses the same register, make sure to transfer any kill flag. - bool isKill = memOps[i].isKill || KilledRegs.count(Reg); - Regs.push_back(std::make_pair(Reg, isKill)); - - // Collect any implicit defs of super-registers. They must be preserved. - for (const MachineOperand &MO : memOps[i].MBBI->operands()) { - if (!MO.isReg() || !MO.isDef() || !MO.isImplicit() || MO.isDead()) - continue; - unsigned DefReg = MO.getReg(); - if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end()) - ImpDefs.push_back(DefReg); - - // There may be other uses of the definition between this instruction and - // the eventual LDM/STM position. These should be marked undef if the - // merge takes place. - findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position, - insertPos); - } +#endif + + // Attempt the merge. + typedef MachineBasicBlock::iterator iterator; + MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx]; + iterator InsertBefore = std::next(iterator(LatestMI)); + MachineBasicBlock &MBB = *LatestMI->getParent(); + unsigned Offset = getMemoryOpOffset(First); + unsigned Base = getLoadStoreBaseOp(*First).getReg(); + bool BaseKill = LatestMI->killsRegister(Base); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg); + DebugLoc DL = First->getDebugLoc(); + MachineInstr *Merged = MergeOps(MBB, InsertBefore, Offset, Base, BaseKill, + Opcode, Pred, PredReg, DL, Regs); + if (!Merged) + return nullptr; + + // Determine earliest instruction that will get removed. We then keep an + // iterator just above it so the following erases don't invalidated it. + iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]); + bool EarliestAtBegin = false; + if (EarliestI == MBB.begin()) { + EarliestAtBegin = true; + } else { + EarliestI = std::prev(EarliestI); } - // Try to do the merge. - MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI; - ++Loc; - if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode, - Pred, PredReg, Scratch, dl, Regs, ImpDefs)) - return; - - // Merge succeeded, update records. - Merges.push_back(std::prev(Loc)); - - // In gathering loads together, we may have moved the imp-def of a register - // past one of its uses. This is OK, since we know better than the rest of - // LLVM what's OK with ARM loads and stores; but we still have to adjust the - // affected uses. - for (SmallVectorImpl::iterator I = UsesOfImpDefs.begin(), - E = UsesOfImpDefs.end(); - I != E; ++I) - (*I)->setIsUndef(); + // Remove instructions which have been merged. + for (MachineInstr *MI : Cand.Instrs) + MBB.erase(MI); + + // Determine range between the earliest removed instruction and the new one. + if (EarliestAtBegin) + EarliestI = MBB.begin(); + else + EarliestI = std::next(EarliestI); + auto FixupRange = make_range(EarliestI, iterator(Merged)); + + if (isLoad(Opcode)) { + // If the previous loads defined a super-reg, then we have to mark earlier + // operands undef; Replicate the super-reg def on the merged instruction. + for (MachineInstr &MI : FixupRange) { + for (unsigned &ImpDefReg : ImpDefs) { + for (MachineOperand &MO : MI.implicit_operands()) { + if (!MO.isReg() || MO.getReg() != ImpDefReg) + continue; + if (MO.readsReg()) + MO.setIsUndef(); + else if (MO.isDef()) + ImpDefReg = 0; + } + } + } - for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { - // Remove kill flags from any memops that come before insertPos. - if (Regs[i-memOpsBegin].second) { - unsigned Reg = Regs[i-memOpsBegin].first; - if (KilledRegs.count(Reg)) { - unsigned j = Killer[Reg]; - int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true); - assert(Idx >= 0 && "Cannot find killing operand"); - memOps[j].MBBI->getOperand(Idx).setIsKill(false); - memOps[j].isKill = false; + MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged); + for (unsigned ImpDef : ImpDefs) + MIB.addReg(ImpDef, RegState::ImplicitDefine); + } else { + // Remove kill flags: We are possibly storing the values later now. + assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD); + for (MachineInstr &MI : FixupRange) { + for (MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.isKill()) + continue; + if (KilledRegs.count(MO.getReg())) + MO.setIsKill(false); } - memOps[i].isKill = true; } - MBB.erase(memOps[i].MBBI); - // Update this memop to refer to the merged instruction. - // We may need to move kill flags again. - memOps[i].Merged = true; - memOps[i].MBBI = Merges.back(); - memOps[i].Position = insertPos; + assert(ImpDefs.empty()); } - // Update memOps offsets, since they may have been modified by MergeOps. - for (auto &MemOp : memOps) { - MemOp.Offset = getMemoryOpOffset(MemOp.MBBI); - } + return Merged; } -/// Merge a number of load / store instructions into one or more load / store -/// multiple instructions. -void -ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, - unsigned Base, unsigned Opcode, unsigned Size, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned Scratch, MemOpQueue &MemOps, - SmallVectorImpl &Merges) { +/// Find candidates for load/store multiple merge in list of MemOpQueueEntries. +void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { + const MachineInstr *FirstMI = MemOps[0].MI; + unsigned Opcode = FirstMI->getOpcode(); bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); - int Offset = MemOps[SIndex].Offset; - int SOffset = Offset; - unsigned insertAfter = SIndex; - MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI; - DebugLoc dl = Loc->getDebugLoc(); - const MachineOperand &PMO = Loc->getOperand(0); - unsigned PReg = PMO.getReg(); - unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg); - unsigned Count = 1; - unsigned Limit = ~0U; - bool BaseKill = false; + unsigned Size = getLSMultipleTransferSize(FirstMI); // vldm / vstm limit are 32 for S variants, 16 for D variants. - + unsigned Limit; switch (Opcode) { - default: break; + default: + Limit = UINT_MAX; + break; case ARM::VSTRS: Limit = 32; break; @@ -859,46 +892,64 @@ break; } - for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { - int NewOffset = MemOps[i].Offset; - const MachineOperand &MO = MemOps[i].MBBI->getOperand(0); - unsigned Reg = MO.getReg(); - unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); - // Register numbers must be in ascending order. For VFP / NEON load and - // store multiples, the registers must also be consecutive and within the - // limit on the number of registers per instruction. - if (Reg != ARM::SP && - NewOffset == Offset + (int)Size && - ((isNotVFP && RegNum > PRegNum) || - ((Count < Limit) && RegNum == PRegNum+1)) && - // On Swift we don't want vldm/vstm to start with a odd register num - // because Q register unaligned vldm/vstm need more uops. - (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) { + unsigned SIndex = 0; + unsigned EIndex = MemOps.size(); + do { + // Look at the first instruction. + const MachineInstr *MI = MemOps[SIndex].MI; + int Offset = MemOps[SIndex].Offset; + const MachineOperand &PMO = getLoadStoreRegOp(*MI); + unsigned PReg = PMO.getReg(); + unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg); + unsigned Latest = SIndex; + unsigned Earliest = SIndex; + unsigned Count = 1; + + // Merge additional instructions fulfilling LDM/STM constraints. + for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) { + int NewOffset = MemOps[I].Offset; + if (NewOffset != Offset + (int)Size) + break; + const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI); + unsigned Reg = MO.getReg(); + if (Reg == ARM::SP) + break; + unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); + // Register numbers must be in ascending order. + if (RegNum <= PRegNum) + break; + // For VFP / NEON load/store multiples, the registers must be consecutive + // and within the limit on the number of registers per instruction. + if (!isNotVFP && RegNum != PRegNum+1) + break; + // On Swift we don't want vldm/vstm to start with a odd register num + // because Q register unaligned vldm/vstm need more uops. + if (!isNotVFP && STI->isSwift() && Count == 1 && (PRegNum % 2) == 1) + break; + + // Track MemOp with latest and earliest position (Positions are + // counted in reverse). + unsigned Position = MemOps[I].Position; + if (Position < MemOps[Latest].Position) + Latest = I; + else if (Position > MemOps[Earliest].Position) + Earliest = I; + // Prepare for next MemOp. Offset += Size; PRegNum = RegNum; - ++Count; - } else { - // Can't merge this in. Try merge the earlier ones first. - // We need to compute BaseKill here because the MemOps may have been - // reordered. - BaseKill = Loc->killsRegister(Base); - - MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, Base, - BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges); - MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch, - MemOps, Merges); - return; } - if (MemOps[i].Position > MemOps[insertAfter].Position) { - insertAfter = i; - Loc = MemOps[i].MBBI; - } - } - - BaseKill = Loc->killsRegister(Base); - MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset, - Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges); + // Form a candidate from the Ops collected so far. + MergeCandidate *Candidate = new(Allocator) MergeCandidate; + for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C) + Candidate->Instrs.push_back(MemOps[C].MI); + Candidate->LatestMIIdx = Latest - SIndex; + Candidate->EarliestMIIdx = Earliest - SIndex; + Candidate->InsertPos = MemOps[Latest].Position; + Candidates.push_back(Candidate); + // Continue after the chain. + SIndex += Count; + } while (SIndex < EIndex); } static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, @@ -971,49 +1022,6 @@ return CheckCPSRDef ? !definesCPSR(MI) : true; } -static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { - switch (MI->getOpcode()) { - default: return 0; - case ARM::LDRi12: - case ARM::STRi12: - case ARM::tLDRi: - case ARM::tSTRi: - case ARM::tLDRspi: - case ARM::tSTRspi: - case ARM::t2LDRi8: - case ARM::t2LDRi12: - case ARM::t2STRi8: - case ARM::t2STRi12: - case ARM::VLDRS: - case ARM::VSTRS: - return 4; - case ARM::VLDRD: - case ARM::VSTRD: - return 8; - case ARM::LDMIA: - case ARM::LDMDA: - case ARM::LDMDB: - case ARM::LDMIB: - case ARM::STMIA: - case ARM::STMDA: - case ARM::STMDB: - case ARM::STMIB: - case ARM::tLDMIA: - case ARM::tLDMIA_UPD: - case ARM::tSTMIA_UPD: - case ARM::t2LDMIA: - case ARM::t2LDMDB: - case ARM::t2STMIA: - case ARM::t2STMDB: - case ARM::VLDMSIA: - case ARM::VSTMSIA: - return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4; - case ARM::VLDMDIA: - case ARM::VSTMDIA: - return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8; - } -} - static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, ARM_AM::AMSubMode Mode) { switch (Opc) { @@ -1093,21 +1101,18 @@ /// ldmia rn, /// => /// ldmdb rn!, -bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - bool &Advance, - MachineBasicBlock::iterator &I) { +bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { // Thumb1 is already using updating loads/stores. if (isThumb1) return false; - MachineInstr *MI = MBBI; - unsigned Base = MI->getOperand(0).getReg(); - bool BaseKill = MI->getOperand(0).isKill(); + const MachineOperand &BaseOP = MI->getOperand(0); + unsigned Base = BaseOP.getReg(); + bool BaseKill = BaseOP.isKill(); unsigned Bytes = getLSMultipleTransferSize(MI); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); unsigned Opcode = MI->getOpcode(); - DebugLoc dl = MI->getDebugLoc(); + DebugLoc DL = MI->getDebugLoc(); // Can't use an updating ld/st if the base register is also a dest // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. @@ -1119,7 +1124,9 @@ ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); // Try merging with the previous instruction. + MachineBasicBlock &MBB = *MI->getParent(); MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + MachineBasicBlock::iterator MBBI(MI); if (MBBI != BeginMBBI) { MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) @@ -1150,20 +1157,15 @@ isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { DoMerge = true; } - if (DoMerge) { - if (NextMBBI == I) { - Advance = true; - ++I; - } + if (DoMerge) MBB.erase(NextMBBI); - } } if (!DoMerge) return false; unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) .addReg(Base, getDefRegState(true)) // WB base register .addReg(Base, getKillRegState(BaseKill)) .addImm(Pred).addReg(PredReg); @@ -1231,21 +1233,16 @@ /// Fold proceeding/trailing inc/dec of base register into the /// LDR/STR/FLD{D|S}/FST{D|S} op when possible: -bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const TargetInstrInfo *TII, - bool &Advance, - MachineBasicBlock::iterator &I) { +bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // Thumb1 doesn't have updating LDR/STR. // FIXME: Use LDM/STM with single register instead. if (isThumb1) return false; - MachineInstr *MI = MBBI; - unsigned Base = MI->getOperand(1).getReg(); - bool BaseKill = MI->getOperand(1).isKill(); + unsigned Base = getLoadStoreBaseOp(*MI).getReg(); + bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); unsigned Bytes = getLSMultipleTransferSize(MI); unsigned Opcode = MI->getOpcode(); - DebugLoc dl = MI->getDebugLoc(); + DebugLoc DL = MI->getDebugLoc(); bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || Opcode == ARM::VSTRD || Opcode == ARM::VSTRS); bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12); @@ -1255,7 +1252,7 @@ if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) return false; - bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD; + bool isLd = isLoad(Opcode); // Can't do the merge if the destination register is the same as the would-be // writeback register. if (MI->getOperand(0).getReg() == Base) @@ -1270,7 +1267,9 @@ unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); // Try merging with the previous instruction. + MachineBasicBlock &MBB = *MI->getParent(); MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + MachineBasicBlock::iterator MBBI(MI); if (MBBI != BeginMBBI) { MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) @@ -1303,10 +1302,6 @@ } if (DoMerge) { NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub); - if (NextMBBI == I) { - Advance = true; - ++I; - } MBB.erase(NextMBBI); } } @@ -1320,7 +1315,7 @@ // updating load/store-multiple instructions can be used with only one // register.) MachineOperand &MO = MI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) .addReg(Base, getDefRegState(true)) // WB base register .addReg(Base, getKillRegState(isLd ? BaseKill : false)) .addImm(Pred).addReg(PredReg) @@ -1331,19 +1326,19 @@ // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } else { int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); } } else { int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2LDR_PRE, t2LDR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } @@ -1355,13 +1350,13 @@ if (isAM2 && NewOpc == ARM::STR_POST_IMM) { int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); } else { int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2STR_PRE, t2STR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } @@ -1426,26 +1421,10 @@ return false; } -/// Advance register scavenger to just before the earliest memory op that is -/// being merged. -void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { - MachineBasicBlock::iterator Loc = MemOps[0].MBBI; - unsigned Position = MemOps[0].Position; - for (unsigned i = 1, e = MemOps.size(); i != e; ++i) { - if (MemOps[i].Position < Position) { - Position = MemOps[i].Position; - Loc = MemOps[i].MBBI; - } - } - - if (Loc != MBB.begin()) - RS->forward(std::prev(Loc)); -} - static void InsertLDR_STR(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, int Offset, bool isDef, - DebugLoc dl, unsigned NewOpc, + DebugLoc DL, unsigned NewOpc, unsigned Reg, bool RegDeadKill, bool RegUndef, unsigned BaseReg, bool BaseKill, bool BaseUndef, bool OffKill, bool OffUndef, @@ -1491,7 +1470,6 @@ if (!Errata602117 && !NonConsecutiveRegs) return false; - MachineBasicBlock::iterator NewBBI = MBBI; bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; bool EvenDeadKill = isLd ? @@ -1531,7 +1509,6 @@ getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); ++NumSTRD2STM; } - NewBBI = std::prev(MBBI); } else { // Split into two instructions. unsigned NewOpc = (isLd) @@ -1553,7 +1530,6 @@ OddReg, OddDeadKill, false, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); - NewBBI = std::prev(MBBI); InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, @@ -1573,7 +1549,6 @@ EvenReg, EvenDeadKill, EvenUndef, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); - NewBBI = std::prev(MBBI); InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, OddReg, OddDeadKill, OddUndef, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, @@ -1585,16 +1560,13 @@ ++NumSTRD2STR; } - MBB.erase(MI); - MBBI = NewBBI; + MBBI = MBB.erase(MBBI); return true; } /// An optimization pass to turn multiple LDR / STR ops of the same base and /// incrementing offset into LDM / STM ops. bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { - unsigned NumMerges = 0; - unsigned NumMemOps = 0; MemOpQueue MemOps; unsigned CurrBase = 0; unsigned CurrOpc = ~0u; @@ -1602,174 +1574,137 @@ ARMCC::CondCodes CurrPred = ARMCC::AL; unsigned CurrPredReg = 0; unsigned Position = 0; - SmallVector Merges; + assert(Candidates.size() == 0); + LiveRegsValid = false; - RS->enterBasicBlock(&MBB); - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { + for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); + I = MBBI) { + // The instruction in front of the iterator is the one we look at. + MBBI = std::prev(I); if (FixInvalidRegPairOp(MBB, MBBI)) continue; + ++Position; - bool Advance = false; - bool TryMerge = false; - - bool isMemOp = isMemoryOp(MBBI); - if (isMemOp) { + if (isMemoryOp(MBBI)) { unsigned Opcode = MBBI->getOpcode(); unsigned Size = getLSMultipleTransferSize(MBBI); const MachineOperand &MO = MBBI->getOperand(0); unsigned Reg = MO.getReg(); - bool isKill = MO.isDef() ? false : MO.isKill(); - unsigned Base = MBBI->getOperand(1).getReg(); + unsigned Base = getLoadStoreBaseOp(*MBBI).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg); int Offset = getMemoryOpOffset(MBBI); - // Watch out for: - // r4 := ldr [r5] - // r5 := ldr [r5, #4] - // r6 := ldr [r5, #8] - // - // The second ldr has effectively broken the chain even though it - // looks like the later ldr(s) use the same base register. Try to - // merge the ldr's so far, including this one. But don't try to - // combine the following ldr(s). - bool Clobber = isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg(); - - // Watch out for: - // r4 := ldr [r0, #8] - // r4 := ldr [r0, #4] - // - // The optimization may reorder the second ldr in front of the first - // ldr, which violates write after write(WAW) dependence. The same as - // str. Try to merge inst(s) already in MemOps. - bool Overlap = false; - for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) { - if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) { - Overlap = true; - break; - } - } - - if (CurrBase == 0 && !Clobber) { + if (CurrBase == 0) { // Start of a new chain. CurrBase = Base; CurrOpc = Opcode; CurrSize = Size; CurrPred = Pred; CurrPredReg = PredReg; - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); - ++NumMemOps; - Advance = true; - } else if (!Overlap) { - if (Clobber) { - TryMerge = true; - Advance = true; + MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position)); + continue; + } + // Note: No need to match PredReg in the next if. + if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { + // Watch out for: + // r4 := ldr [r0, #8] + // r4 := ldr [r0, #4] + // or + // r0 := ldr [r0] + // If a load overrides the base register or a register loaded by + // another load in our chain, we cannot take this instruction. + bool Overlap = false; + if (isLoad(Opcode)) { + Overlap = (Base == Reg); + if (!Overlap) { + for (const MemOpQueueEntry &E : MemOps) { + if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) { + Overlap = true; + break; + } + } + } } - if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { - // No need to match PredReg. - // Continue adding to the queue. + if (!Overlap) { + // Check offset and sort memory operation into the current chain. if (Offset > MemOps.back().Offset) { - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; - Advance = true; + MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position)); + continue; } else { - for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); - I != E; ++I) { - if (Offset < I->Offset) { - MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; - Advance = true; + MemOpQueue::iterator MI, ME; + for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) { + if (Offset < MI->Offset) { + // Found a place to insert. break; - } else if (Offset == I->Offset) { - // Collision! This can't be merged! + } + if (Offset == MI->Offset) { + // Collision, abort. + MI = ME; break; } } + if (MI != MemOps.end()) { + MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position)); + continue; + } } } } - } - - if (MBBI->isDebugValue()) { - ++MBBI; - if (MBBI == E) - // Reach the end of the block, try merging the memory instructions. - TryMerge = true; - } else if (Advance) { - ++Position; - ++MBBI; - if (MBBI == E) - // Reach the end of the block, try merging the memory instructions. - TryMerge = true; - } else { - TryMerge = true; - } - if (TryMerge) { - if (NumMemOps > 1) { - // Try to find a free register to use as a new base in case it's needed. - // First advance to the instruction just before the start of the chain. - AdvanceRS(MBB, MemOps); - - // Find a scratch register. - unsigned Scratch = - RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); - - // Process the load / store instructions. - RS->forward(std::prev(MBBI)); - - // Merge ops. - Merges.clear(); - MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, - CurrPred, CurrPredReg, Scratch, MemOps, Merges); - - // Try folding preceding/trailing base inc/dec into the generated - // LDM/STM ops. - for (unsigned i = 0, e = Merges.size(); i < e; ++i) - if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) - ++NumMerges; - NumMerges += Merges.size(); - - // Try folding preceding/trailing base inc/dec into those load/store - // that were not merged to form LDM/STM ops. - for (unsigned i = 0; i != NumMemOps; ++i) - if (!MemOps[i].Merged) - if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI)) - ++NumMerges; - - // RS may be pointing to an instruction that's deleted. - RS->skipTo(std::prev(MBBI)); - } else if (NumMemOps == 1) { - // Try folding preceding/trailing base inc/dec into the single - // load/store. - if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { - ++NumMerges; - RS->forward(std::prev(MBBI)); - } - } + // Don't advance the iterator; The op will start a new chain next. + MBBI = I; + --Position; + // Fallthrough to look into existing chain. + } else if (MBBI->isDebugValue()) + continue; + // If we are here then the chain is broken; Extract candidates for a merge. + if (MemOps.size() > 0) { + FormCandidates(MemOps); + // Reset for the next chain. CurrBase = 0; CurrOpc = ~0u; CurrSize = 0; CurrPred = ARMCC::AL; CurrPredReg = 0; - if (NumMemOps) { - MemOps.clear(); - NumMemOps = 0; - } + MemOps.clear(); + } + } + if (MemOps.size() > 0) + FormCandidates(MemOps); - // If iterator hasn't been advanced and this is not a memory op, skip it. - // It can't start a new chain anyway. - if (!Advance && !isMemOp && MBBI != E) { - ++Position; - ++MBBI; + // Sort candidates so they get processed from end to begin of the basic + // block later; This is necessary for liveness calculation. + auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) { + return M0->InsertPos < M1->InsertPos; + }; + std::sort(Candidates.begin(), Candidates.end(), LessThan); + + // Go through list of candidates and merge. + bool Changed = false; + for (const MergeCandidate *Candidate : Candidates) { + if (Candidate->Instrs.size() > 1) { + MachineInstr *Merged = MergeOpsUpdate(*Candidate); + // Merge preceding/trailing base inc/dec into the merged op. + if (Merged) { + MergeBaseUpdateLSMultiple(Merged); + Changed = true; + } else { + for (MachineInstr *MI : Candidate->Instrs) { + if (MergeBaseUpdateLoadStore(MI)) + Changed = true; + } } + } else { + assert(Candidate->Instrs.size() == 1); + if (MergeBaseUpdateLoadStore(Candidate->Instrs.front())) + Changed = true; } } - return NumMerges > 0; + Candidates.clear(); + + return Changed; } /// If this is a exit BB, try merging the return ops ("bx lr" and "mov pc, lr") @@ -1814,12 +1749,14 @@ } bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + MF = &Fn; STI = &static_cast(Fn.getSubtarget()); TL = STI->getTargetLowering(); AFI = Fn.getInfo(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); - RS = new RegScavenger(); + MRI = &Fn.getRegInfo(); + RegClassInfoValid = false; isThumb2 = AFI->isThumb2Function(); isThumb1 = AFI->isThumbFunction() && !isThumb2; @@ -1832,7 +1769,7 @@ Modified |= MergeReturnIntoLDM(MBB); } - delete RS; + Allocator.Reset(); return Modified; } @@ -2219,7 +2156,7 @@ continue; int Opc = MI->getOpcode(); - bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD; + bool isLd = isLoad(Opc); unsigned Base = MI->getOperand(1).getReg(); int Offset = getMemoryOpOffset(MI); Index: test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll =================================================================== --- test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll +++ test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll @@ -9,8 +9,8 @@ ) { ;CHECK: sub sp, sp, #16 ;CHECK: push {r11, lr} -;CHECK: add r11, sp, #8 -;CHECK: stm r11, {r0, r1, r2, r3} +;CHECK: add r12, sp, #8 +;CHECK: stm r12, {r0, r1, r2, r3} ;CHECK: add r0, sp, #12 ;CHECK: bl useInt ;CHECK: pop {r11, lr}