Index: lib/Target/NDS32/CMakeLists.txt =================================================================== --- lib/Target/NDS32/CMakeLists.txt +++ lib/Target/NDS32/CMakeLists.txt @@ -25,6 +25,7 @@ NDS32MCInstLower.cpp NDS32AsmPrinter.cpp NDS32TargetMachine.cpp + NDS32LoadStoreOptimizer.cpp ) add_subdirectory(TargetInfo) Index: lib/Target/NDS32/NDS32.h =================================================================== --- lib/Target/NDS32/NDS32.h +++ lib/Target/NDS32/NDS32.h @@ -25,6 +25,9 @@ FunctionPass *createNDS32ISelDag(NDS32TargetMachine &TM, CodeGenOpt::Level OptLevel); + FunctionPass *createNDS32LoadStoreOptimizationPass(bool PreAlloc = false); + + void initializeNDS32LoadStoreOptPass(PassRegistry &); } // end namespace llvm; #endif Index: lib/Target/NDS32/NDS32LoadStoreOptimizer.cpp =================================================================== --- /dev/null +++ lib/Target/NDS32/NDS32LoadStoreOptimizer.cpp @@ -0,0 +1,975 @@ +//===-- NDS32LoadStoreOptimizer.cpp - NDS32 load / store opt. pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains a pass that performs load / store related peephole +/// optimizations. 
+// +//===----------------------------------------------------------------------===// + +#include "NDS32.h" +#include "NDS32InstrInfo.h" +#include "NDS32RegisterInfo.h" +#include "NDS32ISelLowering.h" +#include "NDS32MachineFunctionInfo.h" +#include "NDS32Subtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "nds32-ldst-opt" + +STATISTIC(NumLMW_BIMGened , "Number of lmw.bim instructions generated"); +STATISTIC(NumLMW_BDGened , "Number of lmw.bd instructions generated"); +STATISTIC(NumLMW_BIGened , "Number of lmw.bi instructions generated"); +STATISTIC(NumLMW_AIGened , "Number of lmw.ai instructions generated"); +STATISTIC(NumSMW_ADMGened , "Number of smw.adm instructions generated"); +STATISTIC(NumSMW_BDGened , "Number of smw.bd instructions generated"); +STATISTIC(NumSMW_BIGened , "Number of smw.bi instructions generated"); +STATISTIC(NumSMW_AIGened , "Number of smw.ai instructions generated"); +STATISTIC(NumSWI_BIGened , "Number of swi.bi instructions generated"); + +#define NDS32_LOAD_STORE_OPT_NAME "NDS32 load / store optimization pass" + 
+namespace {
+  struct NDS32LoadStoreOpt : public MachineFunctionPass {
+    static char ID;
+    NDS32LoadStoreOpt() : MachineFunctionPass(ID) {}
+
+    const MachineFunction *MF;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const NDS32Subtarget *STI;
+    const TargetLowering *TL;
+    NDS32MachineFunctionInfo *NFI;
+    LivePhysRegs LiveRegs;
+    RegisterClassInfo RegClassInfo;
+    MachineBasicBlock::const_iterator LiveRegPos;
+    bool LiveRegsValid;
+    bool RegClassInfoValid;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override { return NDS32_LOAD_STORE_OPT_NAME; }
+
+  private:
+    /// A set of load/store MachineInstrs with same base register.
+    struct MemOpQueueEntry {
+      MachineInstr *MI;
+      int Offset;        ///< Load/Store offset.
+      unsigned Reg;      ///< Load/Store reg
+      unsigned Position; ///< Position as counted from end of basic block.
+      MemOpQueueEntry(MachineInstr &MI, int Offset, unsigned Reg,
+                      unsigned Position)
+          : MI(&MI), Offset(Offset), Reg(Reg), Position(Position) {}
+    };
+    typedef SmallVector<MemOpQueueEntry, 8> MemOpQueue;
+
+    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
+    /// merged into a LMW/SMW.
+    struct MergeCandidate {
+      /// List of instructions ordered by load/store offset.
+      SmallVector<MachineInstr*, 4> Instrs;
+      /// Index in Instrs of the instruction being latest in the schedule.
+      unsigned LatestMIIdx;
+      /// Index in Instrs of the instruction being earliest in the schedule.
+      unsigned EarliestMIIdx;
+      /// Index into the basic block where the merged instruction will be
+      /// inserted. (See MemOpQueueEntry.Position)
+      unsigned InsertPos;
+      /// Whether the instructions can be merged into a lmw/smw instruction.
+ bool CanMergeToLSMulti; + }; + SpecificBumpPtrAllocator Allocator; + SmallVector Candidates; + SmallVector MergeBaseCandidates; + + MachineInstr *CreateLoadStoreMulti( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + int BaseOffset, int OffsetDiff, unsigned Base, bool BaseKill, + unsigned Opcode, const DebugLoc &DL, + ArrayRef> Regs, + MachineInstr *EarliestMI, MachineInstr *LatestMI); + + MachineBasicBlock::iterator findIncDecBefore( + MachineBasicBlock::iterator MBBI, unsigned Reg, int &Offset); + + bool MergeBaseUpdateLSMultiple(MachineInstr *MI); + MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand); + void FormCandidates(const MemOpQueue &MemOps); + bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); + }; + char NDS32LoadStoreOpt::ID = 0; +} + +INITIALIZE_PASS(NDS32LoadStoreOpt, "nds32-ldst-opt", NDS32_LOAD_STORE_OPT_NAME, + false, false) + +static bool isLoadSingle(unsigned Opc) { + return Opc == NDS32::LWI || Opc == NDS32::LWI_BI || + Opc == NDS32::LWI37 || Opc == NDS32::LWI37SP; +} + +static bool isi32Store(unsigned Opc) { + return Opc == NDS32::SWI || Opc == NDS32::SWI_BI || + Opc == NDS32::SWI37 || Opc == NDS32::SWI37SP; +} + +static bool isi32StoreWithoutBI(unsigned Opc) { + return Opc == NDS32::SWI || + Opc == NDS32::SWI37 || Opc == NDS32::SWI37SP; +} + +static bool isi32LoadWithoutBI(unsigned Opc) { + return Opc == NDS32::LWI || + Opc == NDS32::LWI37 || Opc == NDS32::LWI37SP; +} + +static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) { + return MI.getOperand(0); +} + +static unsigned getLoadStoreBaseReg(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case NDS32::SWI37: + case NDS32::LWI37: + return NDS32::FP; + case NDS32::SWI37SP: + case NDS32::LWI37SP: + return NDS32::SP; + default: + break; + } + return MI.getOperand(1).getReg(); +} + +static int getMemoryOpOffset(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case NDS32::SWI_BI: + 
case NDS32::LWI_BI: + return MI.getOperand(3).getImm(); + default: + break; + } + return MI.getOperand(2).getImm(); +} + +/// Returns true if instruction is a memory operation that this pass is capable +/// of operating on. +static bool isMemoryOp(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case NDS32::SWI: + case NDS32::LWI: + case NDS32::SWI_BI: + case NDS32::LWI_BI: + case NDS32::SWI37: + case NDS32::LWI37: + case NDS32::SWI37SP: + case NDS32::LWI37SP: + break; + default: + return false; + } + return true; +} + +/// Find candidates for load/store multiple merge in list of MemOpQueueEntries. +void NDS32LoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { + + unsigned SIndex = 0; + unsigned EIndex = MemOps.size(); + + do { + // Look at the first instruction. + int Offset = MemOps[SIndex].Offset; + int POffset = Offset; + unsigned Latest = SIndex; + unsigned Earliest = SIndex; + unsigned Count = 1; + + bool CanMergeToLSMulti = true; + bool DetectOffset = true; + bool IsOffsetInc = true; + + // Merge following instructions where possible. + for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) { + Offset = MemOps[I].Offset; + + // Offset should be increment or decrement by 4 + if (DetectOffset) { + if (Offset + 4 == POffset) + IsOffsetInc = false; + else if (Offset - 4 == POffset) + IsOffsetInc = true; + else + break; + DetectOffset = false; + } + bool PartOfLSMulti = CanMergeToLSMulti; + if (PartOfLSMulti) { + // If previous Offset is increment by 4 but current Offset not. + if (!IsOffsetInc && (Offset + 4 != POffset)) + PartOfLSMulti = false; + // If previous Offset is decrement by 4 but current Offset not. + else if (IsOffsetInc && (Offset - 4 != POffset)) + PartOfLSMulti = false; + } + + if (!PartOfLSMulti) + break; + CanMergeToLSMulti &= PartOfLSMulti; + // Track MemOp with latest and earliest position (Positions are + // counted in reverse). 
+      unsigned Position = MemOps[I].Position;
+      if (Position < MemOps[Latest].Position)
+        Latest = I;
+      else if (Position > MemOps[Earliest].Position)
+        Earliest = I;
+      // Prepare for next MemOp.
+      POffset = Offset;
+    }
+    // Form a candidate from the Ops collected so far.
+    MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
+    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
+      Candidate->Instrs.push_back(MemOps[C].MI);
+    Candidate->LatestMIIdx = Latest - SIndex;
+    Candidate->EarliestMIIdx = Earliest - SIndex;
+    Candidate->InsertPos = MemOps[Latest].Position;
+    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
+    Candidates.push_back(Candidate);
+    // Continue after the chain.
+    SIndex += Count;
+  } while (SIndex < EIndex);
+}
+
+/// Call MergeOps and update MemOps and merges accordingly on success.
+MachineInstr *NDS32LoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
+  const MachineInstr *First = Cand.Instrs.front();
+  int BaseOffset = getMemoryOpOffset(*First);
+  unsigned Opcode = First->getOpcode();
+  bool IsLoad = isLoadSingle(Opcode);
+  SmallVector<std::pair<unsigned, bool>, 32> Regs;
+  SmallVector<unsigned, 4> ImpDefs;
+  DenseSet<unsigned> KilledRegs;
+  DenseSet<unsigned> UsedRegs;
+  int OffsetDiff = 0;
+  // Determine list of registers and list of implicit super-register defs.
+  for (const MachineInstr *MI : Cand.Instrs) {
+    const MachineOperand &MO = getLoadStoreRegOp(*MI);
+    int CurOffset = getMemoryOpOffset(*MI);
+    unsigned Reg = MO.getReg();
+    if ((OffsetDiff == 0) && (BaseOffset != CurOffset)) {
+      OffsetDiff = CurOffset - BaseOffset;
+    }
+
+    bool IsKill = MO.isKill();
+    if (IsKill)
+      KilledRegs.insert(Reg);
+    UsedRegs.insert(Reg);
+
+    Regs.push_back(std::make_pair(Reg, IsKill));
+
+    if (IsLoad) {
+      // Collect any implicit defs of super-registers, after merging we can't
+      // be sure anymore that we properly preserved these live ranges and must
+      // remove these implicit operands.
+ for (const MachineOperand &MO : MI->implicit_operands()) { + if (!MO.isReg() || !MO.isDef() || MO.isDead()) + continue; + assert(MO.isImplicit()); + unsigned DefReg = MO.getReg(); + + if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end()) + continue; + // We can ignore cases where the super-reg is read and written. + if (MI->readsRegister(DefReg)) + continue; + ImpDefs.push_back(DefReg); + } + } + } + // Attempt the merge. + typedef MachineBasicBlock::iterator iterator; + MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx]; + MachineInstr *EarliestMI = Cand.Instrs[Cand.EarliestMIIdx]; + iterator InsertBefore = std::next(iterator(LatestMI)); + MachineBasicBlock &MBB = *LatestMI->getParent(); + unsigned Base = getLoadStoreBaseReg(*First); + bool BaseKill = LatestMI->killsRegister(Base); + DebugLoc DL = First->getDebugLoc(); + MachineInstr *Merged = nullptr; + + if (!Merged && Cand.CanMergeToLSMulti) + Merged = CreateLoadStoreMulti(MBB, InsertBefore, BaseOffset, OffsetDiff, + Base, BaseKill, Opcode, DL, Regs, + EarliestMI, LatestMI); + if (!Merged) + return nullptr; + + // Determine earliest instruction that will get removed. We then keep an + // iterator just above it so the following erases don't invalidated it. + iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]); + bool EarliestAtBegin = false; + if (EarliestI == MBB.begin()) { + EarliestAtBegin = true; + } else { + EarliestI = std::prev(EarliestI); + } + + // Remove instructions which have been merged. + for (MachineInstr *MI : Cand.Instrs) + MBB.erase(MI); + + // Determine range between the earliest removed instruction and the new one. + if (EarliestAtBegin) + EarliestI = MBB.begin(); + else + EarliestI = std::next(EarliestI); + auto FixupRange = make_range(EarliestI, iterator(Merged)); + + if (isLoadSingle(Opcode)) { + // If the previous loads defined a super-reg, then we have to mark earlier + // operands undef; Replicate the super-reg def on the merged instruction. 
+ for (MachineInstr &MI : FixupRange) { + for (unsigned &ImpDefReg : ImpDefs) { + for (MachineOperand &MO : MI.implicit_operands()) { + if (!MO.isReg() || MO.getReg() != ImpDefReg) + continue; + if (MO.readsReg()) + MO.setIsUndef(); + else if (MO.isDef()) + ImpDefReg = 0; + } + } + } + + MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged); + for (unsigned ImpDef : ImpDefs) + MIB.addReg(ImpDef, RegState::ImplicitDefine); + } else { + // Remove kill flags: We are possibly storing the values later now. + assert(isi32Store(Opcode)); + for (MachineInstr &MI : FixupRange) { + for (MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.isKill()) + continue; + if (UsedRegs.count(MO.getReg())) + MO.setIsKill(false); + } + } + assert(ImpDefs.empty()); + } + + return Merged; +} + +/// Check if the given instruction increments or decrements a register and +/// return the amount it is incremented/decremented. +static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg) { + switch (MI.getOpcode()) { + case NDS32::ADDI: + case NDS32::ADDI333: + if (MI.getOperand(0).getReg() != Reg || + MI.getOperand(1).getReg() != Reg) + return 0; + else + return MI.getOperand(2).getImm(); + case NDS32::SUBI333: + if (MI.getOperand(0).getReg() != Reg || + MI.getOperand(1).getReg() != Reg) + return 0; + else + return -MI.getOperand(2).getImm(); + case NDS32::ADD45: + if (MI.getOperand(0).getReg() != Reg) + return 0; + else + return MI.getOperand(1).getImm(); + case NDS32::SUB45: + if (MI.getOperand(0).getReg() != Reg) + return 0; + else + return -MI.getOperand(1).getImm(); + default: + break; + } + + return 0; +} + +/// Searches for an increment or decrement of \p Reg before \p MBBI. 
+MachineBasicBlock::iterator NDS32LoadStoreOpt:: +findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, + int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (MBBI == BeginMBBI) + return EndMBBI; + + // Skip debug values. + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while ((PrevMBBI->isDebugValue() || PrevMBBI->isCFIInstruction()) + && PrevMBBI != BeginMBBI) + --PrevMBBI; + + if (PrevMBBI == BeginMBBI) return EndMBBI; + + Offset = isIncrementOrDecrement(*PrevMBBI, Reg); + return Offset == 0 ? EndMBBI : PrevMBBI; +} + +/// Searches for a increment or decrement of \p Reg after \p MBBI. +static MachineBasicBlock::iterator +findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg, + int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + // Skip debug values. + while (NextMBBI != EndMBBI && + (NextMBBI->isDebugValue() || NextMBBI->isCFIInstruction())) + ++NextMBBI; + + if (NextMBBI == EndMBBI) return EndMBBI; + + Offset = isIncrementOrDecrement(*NextMBBI, Reg); + return Offset == 0 ? EndMBBI : NextMBBI; +} + +bool NDS32LoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { + const MachineOperand &BaseOP = MI->getOperand(0); + unsigned Base = BaseOP.getReg(); + DebugLoc DL = MI->getDebugLoc(); + + // Can't use an updating ld/st if the base register is also a dest + // register. e.g. lmw.bim [r0], {r0, r1, r2}. The behavior is undefined. + for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) + if (MI->getOperand(i).getReg() == Base) + return false; + + // ToDo: Generate extra ADDI for multiple load/store if it's profitable. 
+ + return false; +} + +static unsigned getLSMWOpcode(bool FindAdjustInsnBefore, + bool FindAdjustInsnAfter, int AdjustAmount, + int BaseOffset, int OffsetDiff, unsigned Opcode, + ArrayRef> Regs) { + unsigned PReg = 0; + int LastOffset = 0; + + // In NDS32 multiple load/store, the lowest-numbered register is loaded + // from the lowest memory address while the highest-numbered register + // is loaded from the highest memory address + // E.g. + // SWI R0, [Ra + 4] + // SWI R1, [Ra + 8] <- OffsetDiff = 4, PReg = R0 + // + // Checking the register number and offset here + for (const std::pair &R : Regs) { + // LP, GP, SP, FP will encoded in Enable4 field. + // These four registers don't have to consecutive with other registers. + switch (R.first){ + case NDS32::LP: case NDS32::GP: case NDS32::SP: case NDS32::FP: + continue; + default: + break; + } + + if (PReg == 0) { + PReg = R.first; + continue; + } + + if (PReg + 1 != R.first) + return 0; + PReg = R.first; + } + + LastOffset = BaseOffset + OffsetDiff * (Regs.size() - 1); + // E.g. SP = SP - 48 <- AdjustAmount + // SWI R0, [SP + 40] <- BaseOffset + // SWI LP, [SP + 44] <- LastOffset + // => + // SMW_ADM [SP], {LP, R0} + // SP = SP - 40 + if (FindAdjustInsnBefore && (LastOffset + AdjustAmount == -4) + && isi32Store(Opcode)) { + ++NumSMW_ADMGened; + return NDS32::SMW_ADM; + } + + // E.g + // SWI R2, [R1 + 0] <- BaseOffset + // ADDI R1, R1, -4 <- AdjustAmount + // => + // SWI_BI R2, [R1], -4 + if (FindAdjustInsnAfter && (BaseOffset + AdjustAmount == -4) + && isi32Store(Opcode) && (Regs.size () == 1)) { + ++NumSWI_BIGened; + return NDS32::SWI_BI; + } + + // E.g. 
+ // LWI R0, [SP + 40] <- BaseOffset + // LWI LP, [SP + 44] <- LastOffset + // SP = SP + 48 <- AdjustAmount + // => + // SP = SP + 40 + // LMW_BIM [SP], {LP, R0} + if (FindAdjustInsnAfter && (LastOffset + 4 == AdjustAmount) + && isLoadSingle(Opcode)) { + ++NumLMW_BIMGened; + return NDS32::LMW_BIM; + } + + // Skip single load/store + if (Regs.size() == 1) + return 0; + + if (OffsetDiff != 4) + return 0; + + // E.g + // SWI R17, [R5 + 4] <- BaseOffset + // SWI R18, [R5 + 8] + // => + // SMW_AI [R5], {R17, R18} + if (isi32Store(Opcode) && (BaseOffset == 4)) { + ++NumSMW_AIGened; + return NDS32::SMW_AI; + } + // E.g + // LWI R17, [R5 + 4] <- BaseOffset + // LWI R18, [R5 + 8] + // => + // LMW_AI [R5], {R17, R18} + if (isLoadSingle(Opcode) && (BaseOffset == 4)) { + ++NumLMW_AIGened; + return NDS32::LMW_AI; + } + + // We need to generate extra Base adjustment ADDI before + // multiple load/store if BaseOffse != 0. + // We simply ignore the case here. + if ((BaseOffset != 0) && (LastOffset != 0)) + return 0; + + // E.g + // SWI R17, [R5 - 4] + // SWI R18, [R5 + 0] <- LastOffset + // => + // SMW_BD [R5], {R17, R18} + if (isi32Store(Opcode) && (LastOffset == 0)) { + ++NumSMW_BDGened; + return NDS32::SMW_BD; + } + + // E.g + // SWI R17, [R5 + 0] <- BaseOffset + // SWI R18, [R5 + 4] + // => + // SMW_BI [R5], {R17, R18} + if (isi32Store(Opcode) && (BaseOffset == 0)) { + ++NumSMW_BIGened; + return NDS32::SMW_BI; + } + + // E.g + // LWI R17, [R5 - 4] + // LWI R18, [R5 + 0] <- LastOffset + // => + // LMW_BD [R5], {R17, R18} + if (isLoadSingle(Opcode) && (LastOffset == 0)) { + ++NumLMW_BDGened; + return NDS32::LMW_BD; + } + + // E.g + // LWI R17, [R5 + 0] <- BaseOffset + // LWI R18, [R5 + 4] + // => + // LMW_BI [R5], {R17, R18} + if (isLoadSingle(Opcode) && (BaseOffset == 0)) { + ++NumLMW_BIGened; + return NDS32::LMW_BI; + } + return 0; +} + +static int AddRegToListWithoutEnable4(MachineInstrBuilder LSMW, + unsigned Reg) { + switch (Reg) { + case NDS32::SP: + return 1; + case 
NDS32::LP:
+    return 2;
+  case NDS32::GP:
+    return 4;
+  case NDS32::FP:
+    return 8;
+  default:
+    break;
+  }
+
+  LSMW.addReg(Reg, RegState::Kill); // add Register list
+  return 0;
+}
+
+static void CreateRegList(MachineInstrBuilder LSMW,
+                          ArrayRef<std::pair<unsigned, bool>> Regs,
+                          int OffsetDiff) {
+  int Enable4 = 0;
+
+  // Register number is decrement in Regs if OffsetDiff == -4.
+  // We want Reglist to be increment when generate multiple load/store.
+  if (OffsetDiff == -4) {
+    for (int I = Regs.size() - 1; I > -1; I--)
+      Enable4 += AddRegToListWithoutEnable4 (LSMW, Regs[I].first);
+  } else {
+    for (const std::pair<unsigned, bool> &R : Regs)
+      Enable4 += AddRegToListWithoutEnable4 (LSMW, R.first);
+  }
+
+  // Insert Enable4 registers in the end of the RegList
+  if (Enable4 & 8)
+    LSMW.addReg(NDS32::FP, RegState::Kill);
+  if (Enable4 & 4)
+    LSMW.addReg(NDS32::GP, RegState::Kill);
+  if (Enable4 & 2)
+    LSMW.addReg(NDS32::LP, RegState::Kill);
+  if (Enable4 & 1)
+    LSMW.addReg(NDS32::SP, RegState::Kill);
+}
+
+/// Create and insert a LMW or SMW with Base as base register and registers in
+/// Regs as the register operands that would be loaded / stored. It returns
+/// true if the transformation is done.
+MachineInstr *NDS32LoadStoreOpt::CreateLoadStoreMulti(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    int BaseOffset, int OffsetDiff, unsigned Base, bool BaseKill,
+    unsigned Opcode, const DebugLoc &DL,
+    ArrayRef<std::pair<unsigned, bool>> Regs,
+    MachineInstr *EarliestMI, MachineInstr *LatestMI) {
+
+  MachineBasicBlock::iterator MBBIEarliest(EarliestMI);
+  MachineBasicBlock::iterator MBBILatest(LatestMI);
+  int AdjustAmount;
+  bool FindAdjustInsnBefore = false;
+  bool FindAdjustInsnAfter = false;
+
+  // Find Adjustment instruction before store
+  // E.g.
+ // addi $sp, $sp, -88 <= BaseAdjustInstr + // swi $fp, [$sp + (84)] <= MBBIEarliest + // swi $r8, [$sp + (80)] + MachineBasicBlock::iterator BaseAdjustInstr + = findIncDecBefore(MBBIEarliest, Base, AdjustAmount); + + if (BaseAdjustInstr != MBB.end()) + FindAdjustInsnBefore = true; + // Find Adjustment instruction after load/store + // E.g. + // lwi $r8, [$sp + (80)] + // lwi $fp, [$sp + (84)] <= MBBILatest + // addi $sp, $sp, 88 <= BaseAdjustInstr + if (FindAdjustInsnBefore == false) { + BaseAdjustInstr = findIncDecAfter(MBBILatest, Base, AdjustAmount); + if (BaseAdjustInstr != MBB.end()) + FindAdjustInsnAfter = true; + } + + // The instruction in front of the iterator is the one we look at. + + unsigned LSMWOpcode = getLSMWOpcode(FindAdjustInsnBefore, + FindAdjustInsnAfter, AdjustAmount, + BaseOffset, OffsetDiff, Opcode, Regs); + + if (LSMWOpcode == 0) + return nullptr; + + if (LSMWOpcode == NDS32::SWI_BI) { + MachineInstrBuilder SWI_BI; + // Currently, only consider emit AdjustAmount by ADDI + if (!isInt<15>(AdjustAmount)) + return nullptr; + + SWI_BI = BuildMI(MBB, InsertBefore, DL, TII->get(NDS32::SWI_BI)); + SWI_BI.addReg(Base, RegState::Define); + SWI_BI.addReg(Regs[0].first, RegState::Kill); + SWI_BI.addReg(Base, RegState::Define); + SWI_BI.addImm (AdjustAmount); + + // Remove original Base adjustment instruction. + if (BaseAdjustInstr != MBB.end()) + MBB.erase(BaseAdjustInstr); + + return SWI_BI; + } else if (LSMWOpcode == NDS32::LMW_BIM) { + MachineInstrBuilder LMW; + + AdjustAmount = AdjustAmount - 4 * Regs.size(); + + // Currently, only consider emit AdjustAmount by ADDI + if (!isInt<15>(AdjustAmount)) + return nullptr; + + // Emit new Base adjustment instruction. 
+ if (AdjustAmount) + BuildMI(MBB, InsertBefore, DL, TII->get(NDS32::ADDI)) + .addReg(Base, RegState::Define) + .addReg(Base).addImm (AdjustAmount); + + LMW = BuildMI(MBB, InsertBefore, DL, TII->get(LSMWOpcode)); + LMW.addReg(Base, RegState::Define); // add $wb + LMW.addReg(Base, getKillRegState(BaseKill)); // add BaseReg [Ra] + + CreateRegList (LMW, Regs, OffsetDiff); + + // Remove original Base adjustment instruction. + if (BaseAdjustInstr != MBB.end()) + MBB.erase(BaseAdjustInstr); + + return LMW; + } else if (LSMWOpcode == NDS32::SMW_ADM) { + MachineInstrBuilder SMW; + + AdjustAmount = AdjustAmount + 4 * Regs.size(); + + // Currently, only consider emit AdjustAmount by ADDI + if (!isInt<15>(AdjustAmount)) + return nullptr; + + SMW = BuildMI(MBB, InsertBefore, DL, TII->get(LSMWOpcode)); + SMW.addReg(Base, RegState::Define); // add $wb + SMW.addReg(Base, RegState::Define); // add Base [Ra] + + CreateRegList (SMW, Regs, OffsetDiff); + + // Emit new Base adjustment instruction. + if (AdjustAmount) + BuildMI(MBB, InsertBefore, DL, TII->get(NDS32::ADDI)) + .addReg(Base).addReg(Base).addImm (AdjustAmount); + + // Remove original Base adjustment instruction. 
+ if (BaseAdjustInstr != MBB.end()) + MBB.erase(BaseAdjustInstr); + + return SMW; + } else { + MachineInstrBuilder LSMW; + + LSMW = BuildMI(MBB, InsertBefore, DL, TII->get(LSMWOpcode)); + LSMW.addReg(Base); // add Base [Ra] + + CreateRegList(LSMW, Regs, OffsetDiff); + return LSMW; + } + + return nullptr; +} + +// Determine the load/store instructions could be merged +static bool isInstCanMerge(unsigned CurOpcode, unsigned Opcode) { + if (CurOpcode == Opcode) + return true; + // if instructions are i32 bit store without Modify Base return true + if (isi32StoreWithoutBI(CurOpcode) && isi32StoreWithoutBI(Opcode)) + return true; + // if instructions are i32 bit load without Modify Base return true + if (isi32LoadWithoutBI(CurOpcode) && isi32LoadWithoutBI(Opcode)) + return true; + return false; +} + +/// An optimization pass to turn multiple load / store ops of the same base and +/// incrementing offset into LMW / SMW ops. +bool NDS32LoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { + MemOpQueue MemOps; + unsigned CurrBase = 0; + unsigned CurrOpc = ~0u; + unsigned Position = 0; + + for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); + I = MBBI) { + // The instruction in front of the iterator is the one we look at. + MBBI = std::prev(I); + ++Position; + + if (isMemoryOp(*MBBI)) { + unsigned Opcode = MBBI->getOpcode(); + const MachineOperand &MO = MBBI->getOperand(0); + unsigned Reg = MO.getReg(); + unsigned Base = getLoadStoreBaseReg(*MBBI); + int Offset = getMemoryOpOffset(*MBBI); + + if (CurrBase == 0) { + // Start of a new chain. + CurrBase = Base; + CurrOpc = Opcode; + MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Reg, Position)); + continue; + } + // Note: No need to match PredReg in the next if. 
+ if (isInstCanMerge(CurrOpc, Opcode) && CurrBase == Base) { + // Watch out for: + // r4 := lwi [r0, #8] + // r4 := lwi [r0, #4] + // or + // r0 := lwi [r0] + // If a load overrides the base register or a register loaded by + // another load in our chain, we cannot take this instruction. + bool Overlap = false; + if (isLoadSingle(Opcode)) { + Overlap = (Base == Reg); + if (!Overlap) { + for (const MemOpQueueEntry &E : MemOps) { + if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) { + Overlap = true; + break; + } + } + } + } + if (!Overlap) { + // Check offset and sort memory operation by offset + // into the current chain. + if (Offset > MemOps.back().Offset) { + MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Reg, Position)); + continue; + } else { + MemOpQueue::iterator MI, ME; + for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) { + if (Offset < MI->Offset) { + // Found a place to insert. + break; + } + if (Offset == MI->Offset) { + // Collision, abort. + MI = ME; + break; + } + } + if (MI != MemOps.end()) { + MemOps.insert(MI, MemOpQueueEntry(*MBBI, Offset, Reg, Position)); + continue; + } + } + } + } + + // Don't advance the iterator; The op will start a new chain next. + MBBI = I; + --Position; + // Fallthrough to look into existing chain. + } else if (MBBI->isDebugValue()) + continue; + + // If we are here then the chain is broken; Extract candidates for a merge. + if (MemOps.size() > 0) { + FormCandidates(MemOps); + // Reset for the next chain. + CurrBase = 0; + CurrOpc = ~0u; + MemOps.clear(); + } + + } + if (MemOps.size() > 0) + FormCandidates(MemOps); + + // Sort candidates so they get processed from end to begin of the basic + // block later; This is necessary for liveness calculation. + auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) { + return M0->InsertPos < M1->InsertPos; + }; + std::sort(Candidates.begin(), Candidates.end(), LessThan); + + // Go through list of candidates and merge. 
+  bool Changed = false;
+  for (const MergeCandidate *Candidate : Candidates) {
+    if (Candidate->CanMergeToLSMulti) {
+      MachineInstr *Merged = MergeOpsUpdate(*Candidate);
+      // Merge preceding/trailing base inc/dec into the merged op.
+      if (Merged) {
+        Changed = true;
+        MergeBaseUpdateLSMultiple(Merged);
+      }
+    } else {
+      assert(Candidate->Instrs.size() == 1);
+    }
+  }
+  Candidates.clear();
+  MergeBaseCandidates.clear();
+
+  return Changed;
+}
+
+bool NDS32LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  MF = &Fn;
+  STI = &static_cast<const NDS32Subtarget &>(Fn.getSubtarget());
+  TL = STI->getTargetLowering();
+  NFI = Fn.getInfo<NDS32MachineFunctionInfo>();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+
+  RegClassInfoValid = false;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= LoadStoreMultipleOpti(MBB);
+  }
+
+  Allocator.DestroyAll();
+  return Modified;
+}
+
+/// Returns an instance of the load / store optimization pass.
+FunctionPass *llvm::createNDS32LoadStoreOptimizationPass(bool PreAlloc) {
+  return new NDS32LoadStoreOpt();
+}
Index: lib/Target/NDS32/NDS32TargetMachine.cpp
===================================================================
--- lib/Target/NDS32/NDS32TargetMachine.cpp
+++ lib/Target/NDS32/NDS32TargetMachine.cpp
@@ -21,9 +21,17 @@
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
+static cl::opt<bool>
+EnableNDS32LoadStoreOpt("nds32-load-store-opt", cl::Hidden,
+  cl::desc("Enable NDS32 load/store optimization pass"),
+  cl::init(true));
+
 extern "C" void LLVMInitializeNDS32Target() {
   // Register the target.
RegisterTargetMachine<NDS32TargetMachine> X(TheNDS32Target);
+
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+  initializeNDS32LoadStoreOptPass(Registry);
 }
 
 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
@@ -116,6 +124,7 @@
   }
 
   bool addInstSelector() override;
+  void addPreSched2() override;
 };
 } // namespace
@@ -128,3 +137,10 @@
   addPass(createNDS32ISelDag(getNDS32TargetMachine(), getOptLevel()));
   return false;
 }
+
+void NDS32PassConfig::addPreSched2() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    if (EnableNDS32LoadStoreOpt)
+      addPass(createNDS32LoadStoreOptimizationPass());
+  }
+}