Index: lib/Target/NDS32/CMakeLists.txt =================================================================== --- lib/Target/NDS32/CMakeLists.txt +++ lib/Target/NDS32/CMakeLists.txt @@ -21,6 +21,7 @@ NDS32MachineFunctionInfo.cpp NDS32MCInstLower.cpp NDS32AsmPrinter.cpp + NDS32LoadStoreOptimizer.cpp ) add_subdirectory(TargetInfo) Index: lib/Target/NDS32/InstPrinter/NDS32InstPrinter.h =================================================================== --- lib/Target/NDS32/InstPrinter/NDS32InstPrinter.h +++ lib/Target/NDS32/InstPrinter/NDS32InstPrinter.h @@ -47,6 +47,8 @@ void printSPImplyAddrImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printRegisterList(const MCInst *MI, unsigned OpNum, raw_ostream &O); }; } Index: lib/Target/NDS32/InstPrinter/NDS32InstPrinter.cpp =================================================================== --- lib/Target/NDS32/InstPrinter/NDS32InstPrinter.cpp +++ lib/Target/NDS32/InstPrinter/NDS32InstPrinter.cpp @@ -134,3 +134,54 @@ O << "+ " << markup("") << "]" << markup(">"); } + +// print register list in (Rb ,[Ra], Re, Enable4) form +// 1. Ra is the base register +// 2. Rb is the first register in the register list +// 3. Re is the last register in the register list +// 4. Enable4 encode FP, GP, LP, SP if the registers exist in the register list +// 5. 
Rb and Re should set to SP if there is no R0~R27 registers exist +// in the register list +void NDS32InstPrinter::printRegisterList(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + unsigned Rb = NDS32::SP; + unsigned Re = NDS32::SP; + unsigned Ra; + unsigned Enable4 = 0; + unsigned i; + unsigned e = MI->getNumOperands(); + + // First operand will be base register [Ra] + Ra = MI->getOperand(OpNum).getReg(); + + for (i = OpNum + 1; i != e; ++i) { + unsigned Reg = MI->getOperand(i).getReg(); + switch (Reg) { + case NDS32::SP: + Enable4 += 1; + break; + case NDS32::LP: + Enable4 += 2; + break; + case NDS32::GP: + Enable4 += 4; + break; + case NDS32::FP: + Enable4 += 8; + break; + default: + if (Rb == NDS32::SP) + Rb = Reg; + Re = Reg; + break; + } + } + O << '$'; + printRegName(O, Rb); + O << ", [$"; + printRegName(O, Ra); + O << "], $"; + printRegName(O, Re); + O << ", " << Enable4; +} Index: lib/Target/NDS32/NDS32.h =================================================================== --- lib/Target/NDS32/NDS32.h +++ lib/Target/NDS32/NDS32.h @@ -24,6 +24,10 @@ FunctionPass *createNDS32ISelDag(NDS32TargetMachine &TM, CodeGenOpt::Level OptLevel); + + FunctionPass *createNDS32LoadStoreOptimizationPass(bool PreAlloc = false); + + void initializeNDS32LoadStoreOptPass(PassRegistry &); } // end namespace llvm; #endif Index: lib/Target/NDS32/NDS32InstrFormats.td =================================================================== --- lib/Target/NDS32/NDS32InstrFormats.td +++ lib/Target/NDS32/NDS32InstrFormats.td @@ -206,6 +206,18 @@ let Inst{7-0} = subop_encode; } +class LSMWForm pattern, bit isStore> + : 32BitInst<0b011101, outs, ins, asmstr, pattern> { + bits<19> regs; + let Inst{24-20} = regs{18-14};// Rb + let Inst{19-15} = regs{13-9}; // Ra + let Inst{14-10} = regs{8-4}; // Re + let Inst{9-6} = regs{3-0}; // Enable4 + //It's a multiple store instruction if isStore == 1 + let Inst{5} = isStore; + let Inst{1-0} = 0b00; +} + class JIForm pattern> : 
32BitInst<0b100100, outs, ins, asmstr, pattern>; Index: lib/Target/NDS32/NDS32InstrInfo.td =================================================================== --- lib/Target/NDS32/NDS32InstrInfo.td +++ lib/Target/NDS32/NDS32InstrInfo.td @@ -459,6 +459,51 @@ //===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +class lsmw_modify_base : + LSMWForm<(outs GPR:$wb), (ins reglist:$regs, variable_ops), + !strconcat(opstr,"\t$regs"), [], isStore> { + let Constraints = "$regs.base = $wb"; + //Use either base+4 or base-4 as first memory address if use_next_pos == 1 + let Inst{4} = use_next_pos; + let Inst{3} = decrement; + let Inst{2} = 1; // Modify base register + let DecoderMethod = "DecodeLSMWModifyBase"; +} + +class lsmw : + LSMWForm<(outs), (ins reglist:$regs, variable_ops), + !strconcat(opstr,"\t$regs"), [], isStore> { + //Use either base+4 or base-4 as first memory address if use_next_pos == 1 + let Inst{4} = use_next_pos; + let Inst{3} = decrement; + let Inst{2} = 0; // Not modify base register +} + +multiclass nds32_lsmw { + def _ADM : lsmw_modify_base ; + def _BIM : lsmw_modify_base ; + def _AD : lsmw ; + def _BD : lsmw ; + def _BI : lsmw ; + def _AI : lsmw ; +} + +let hasSideEffects = 0 in { + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm LMW : nds32_lsmw<"lmw", 0>; + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm SMW : nds32_lsmw<"smw", 1>; + +} // hasSideEffects + + +//===----------------------------------------------------------------------===// // Control Flow Instructions // Index: lib/Target/NDS32/NDS32LoadStoreOptimizer.cpp =================================================================== --- /dev/null +++ lib/Target/NDS32/NDS32LoadStoreOptimizer.cpp @@ -0,0 +1,970 @@ +//===-- NDS32LoadStoreOptimizer.cpp - NDS32 load / store opt. 
pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains a pass that performs load / store related peephole +/// optimizations. +// +//===----------------------------------------------------------------------===// + +#include "NDS32.h" +#include "NDS32InstrInfo.h" +#include "NDS32RegisterInfo.h" +#include "NDS32ISelLowering.h" +#include "NDS32MachineFunctionInfo.h" +#include "NDS32Subtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "nds32-ldst-opt" + +STATISTIC(NumLMW_BIMGened , "Number of lmw.bim instructions generated"); +STATISTIC(NumLMW_BDGened , "Number of lmw.bd instructions generated"); +STATISTIC(NumLMW_BIGened , "Number of lmw.bi instructions generated"); +STATISTIC(NumLMW_AIGened , "Number of lmw.ai instructions generated"); +STATISTIC(NumSMW_ADMGened , "Number of smw.adm instructions 
generated"); +STATISTIC(NumSMW_BDGened , "Number of smw.bd instructions generated"); +STATISTIC(NumSMW_BIGened , "Number of smw.bi instructions generated"); +STATISTIC(NumSMW_AIGened , "Number of smw.ai instructions generated"); +STATISTIC(NumSWI_BIGened , "Number of swi.bi instructions generated"); + +#define NDS32_LOAD_STORE_OPT_NAME "NDS32 load / store optimization pass" + +namespace { + struct NDS32LoadStoreOpt : public MachineFunctionPass { + static char ID; + NDS32LoadStoreOpt() : MachineFunctionPass(ID) {} + + const MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const NDS32Subtarget *STI; + const TargetLowering *TL; + NDS32MachineFunctionInfo *NFI; + LivePhysRegs LiveRegs; + RegisterClassInfo RegClassInfo; + MachineBasicBlock::const_iterator LiveRegPos; + bool LiveRegsValid; + bool RegClassInfoValid; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { return NDS32_LOAD_STORE_OPT_NAME; } + + private: + /// A set of load/store MachineInstrs with same base register. + struct MemOpQueueEntry { + MachineInstr *MI; + int Offset; ///< Load/Store offset. + unsigned Reg; ///< Load/Store reg + unsigned Position; ///< Position as counted from end of basic block. + MemOpQueueEntry(MachineInstr &MI, int Offset, unsigned Reg, + unsigned Position) + : MI(&MI), Offset(Offset), Reg(Reg), Position(Position) {} + }; + typedef SmallVector MemOpQueue; + + /// A set of MachineInstrs that fulfill (nearly all) conditions to get + /// merged into a LMW/SMW. + struct MergeCandidate { + /// List of instructions ordered by load/store offset. + SmallVector Instrs; + /// Index in Instrs of the instruction being latest in the schedule. + unsigned LatestMIIdx; + /// Index in Instrs of the instruction being earliest in the schedule. 
+ unsigned EarliestMIIdx; + /// Index into the basic block where the merged instruction will be + /// inserted. (See MemOpQueueEntry.Position) + unsigned InsertPos; + /// Whether the instructions can be merged into a lmw/smw instruction. + bool CanMergeToLSMulti; + }; + SpecificBumpPtrAllocator Allocator; + SmallVector Candidates; + SmallVector MergeBaseCandidates; + + MachineInstr *CreateLoadStoreMulti( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + int BaseOffset, int OffsetDiff, unsigned Base, bool BaseKill, + unsigned Opcode, const DebugLoc &DL, + ArrayRef> Regs, + MachineInstr *EarliestMI, MachineInstr *LatestMI); + + MachineBasicBlock::iterator findIncDecBefore( + MachineBasicBlock::iterator MBBI, unsigned Reg, int &Offset); + + bool MergeBaseUpdateLSMultiple(MachineInstr *MI); + MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand); + void FormCandidates(const MemOpQueue &MemOps); + bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); + }; + char NDS32LoadStoreOpt::ID = 0; +} + +INITIALIZE_PASS(NDS32LoadStoreOpt, "nds32-ldst-opt", NDS32_LOAD_STORE_OPT_NAME, + false, false) + +static bool isLoadSingle(unsigned Opc) { + return Opc == NDS32::LWI || Opc == NDS32::LWI_BI || + Opc == NDS32::LWI333 || + Opc == NDS32::LWI37 || Opc == NDS32::LWI37SP; +} + +static bool isi32Store(unsigned Opc) { + return Opc == NDS32::SWI || Opc == NDS32::SWI_BI || + Opc == NDS32::SWI333 || + Opc == NDS32::SWI37 || Opc == NDS32::SWI37SP; +} + +static bool isi32StoreWithoutBI(unsigned Opc) { + return Opc == NDS32::SWI || Opc == NDS32::SWI333 || + Opc == NDS32::SWI37 || Opc == NDS32::SWI37SP; +} + +static bool isi32LoadWithoutBI(unsigned Opc) { + return Opc == NDS32::LWI || Opc == NDS32::LWI333 || + Opc == NDS32::LWI37 || Opc == NDS32::LWI37SP; +} + +static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) { + return MI.getOperand(0); +} + +static unsigned getLoadStoreBaseReg(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + switch 
(Opcode) { + case NDS32::SWI37: + case NDS32::LWI37: + return NDS32::FP; + case NDS32::SWI37SP: + case NDS32::LWI37SP: + return NDS32::SP; + default: + break; + } + return MI.getOperand(1).getReg(); +} + +static int getMemoryOpOffset(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case NDS32::SWI_BI: + case NDS32::LWI_BI: + return MI.getOperand(3).getImm(); + default: + break; + } + return MI.getOperand(2).getImm(); +} + +/// Returns true if instruction is a memory operation that this pass is capable +/// of operating on. +static bool isMemoryOp(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case NDS32::SWI: + case NDS32::LWI: + case NDS32::SWI333: + case NDS32::LWI333: + case NDS32::SWI_BI: + case NDS32::LWI_BI: + case NDS32::SWI37: + case NDS32::LWI37: + case NDS32::SWI37SP: + case NDS32::LWI37SP: + break; + default: + return false; + } + return true; +} + +/// Find candidates for load/store multiple merge in list of MemOpQueueEntries. +void NDS32LoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { + + unsigned SIndex = 0; + unsigned EIndex = MemOps.size(); + + do { + // Look at the first instruction. + int Offset = MemOps[SIndex].Offset; + int POffset = Offset; + unsigned Latest = SIndex; + unsigned Earliest = SIndex; + unsigned Count = 1; + + bool CanMergeToLSMulti = true; + bool DetectOffset = true; + bool IsOffsetInc = true; + + // Merge following instructions where possible. + for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) { + Offset = MemOps[I].Offset; + + // Offset should be increment or decrement by 4 + if (DetectOffset) { + if (Offset + 4 == POffset) + IsOffsetInc = false; + else if (Offset - 4 == POffset) + IsOffsetInc = true; + else + break; + DetectOffset = false; + } + bool PartOfLSMulti = CanMergeToLSMulti; + if (PartOfLSMulti) { + // If previous Offset is increment by 4 but current Offset not. 
+ if (!IsOffsetInc && (Offset + 4 != POffset)) + PartOfLSMulti = false; + // If previous Offset is decrement by 4 but current Offset not. + else if (IsOffsetInc && (Offset - 4 != POffset)) + PartOfLSMulti = false; + } + + if (!PartOfLSMulti) + break; + CanMergeToLSMulti &= PartOfLSMulti; + // Track MemOp with latest and earliest position (Positions are + // counted in reverse). + unsigned Position = MemOps[I].Position; + if (Position < MemOps[Latest].Position) + Latest = I; + else if (Position > MemOps[Earliest].Position) + Earliest = I; + // Prepare for next MemOp. + POffset = Offset; + } + // Form a candidate from the Ops collected so far. + MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate; + for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C) + Candidate->Instrs.push_back(MemOps[C].MI); + Candidate->LatestMIIdx = Latest - SIndex; + Candidate->EarliestMIIdx = Earliest - SIndex; + Candidate->InsertPos = MemOps[Latest].Position; + Candidate->CanMergeToLSMulti = CanMergeToLSMulti; + Candidates.push_back(Candidate); + // Continue after the chain. + SIndex += Count; + } while (SIndex < EIndex); +} + +/// Call MergeOps and update MemOps and merges accordingly on success. +MachineInstr *NDS32LoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { + const MachineInstr *First = Cand.Instrs.front(); + int BaseOffset = getMemoryOpOffset(*First); + unsigned Opcode = First->getOpcode(); + bool IsLoad = isLoadSingle(Opcode); + SmallVector, 32> Regs; + SmallVector ImpDefs; + DenseSet KilledRegs; + DenseSet UsedRegs; + int OffsetDiff = 0; + // Determine list of registers and list of implicit super-register defs. 
+ for (const MachineInstr *MI : Cand.Instrs) { + const MachineOperand &MO = getLoadStoreRegOp(*MI); + int CurOffset = getMemoryOpOffset(*MI); + unsigned Reg = MO.getReg(); + if ((OffsetDiff == 0) && (BaseOffset != CurOffset)) { + OffsetDiff = CurOffset - BaseOffset; + } + + bool IsKill = MO.isKill(); + if (IsKill) + KilledRegs.insert(Reg); + UsedRegs.insert(Reg); + + Regs.push_back(std::make_pair(Reg, IsKill)); + + if (IsLoad) { + // Collect any implicit defs of super-registers, after merging we can't + // be sure anymore that we properly preserved these live ranges and must + // removed these implicit operands. + for (const MachineOperand &MO : MI->implicit_operands()) { + if (!MO.isReg() || !MO.isDef() || MO.isDead()) + continue; + assert(MO.isImplicit()); + unsigned DefReg = MO.getReg(); + + if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end()) + continue; + // We can ignore cases where the super-reg is read and written. + if (MI->readsRegister(DefReg)) + continue; + ImpDefs.push_back(DefReg); + } + } + } + // Attempt the merge. + typedef MachineBasicBlock::iterator iterator; + MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx]; + MachineInstr *EarliestMI = Cand.Instrs[Cand.EarliestMIIdx]; + iterator InsertBefore = std::next(iterator(LatestMI)); + MachineBasicBlock &MBB = *LatestMI->getParent(); + unsigned Base = getLoadStoreBaseReg(*First); + bool BaseKill = LatestMI->killsRegister(Base); + DebugLoc DL = First->getDebugLoc(); + MachineInstr *Merged = nullptr; + + if (!Merged && Cand.CanMergeToLSMulti) + Merged = CreateLoadStoreMulti(MBB, InsertBefore, BaseOffset, OffsetDiff, + Base, BaseKill, Opcode, DL, Regs, + EarliestMI, LatestMI); + if (!Merged) + return nullptr; + + // Determine earliest instruction that will get removed. We then keep an + // iterator just above it so the following erases don't invalidated it. 
+ iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]); + bool EarliestAtBegin = false; + if (EarliestI == MBB.begin()) { + EarliestAtBegin = true; + } else { + EarliestI = std::prev(EarliestI); + } + + // Remove instructions which have been merged. + for (MachineInstr *MI : Cand.Instrs) + MBB.erase(MI); + + // Determine range between the earliest removed instruction and the new one. + if (EarliestAtBegin) + EarliestI = MBB.begin(); + else + EarliestI = std::next(EarliestI); + auto FixupRange = make_range(EarliestI, iterator(Merged)); + + if (isLoadSingle(Opcode)) { + // If the previous loads defined a super-reg, then we have to mark earlier + // operands undef; Replicate the super-reg def on the merged instruction. + for (MachineInstr &MI : FixupRange) { + for (unsigned &ImpDefReg : ImpDefs) { + for (MachineOperand &MO : MI.implicit_operands()) { + if (!MO.isReg() || MO.getReg() != ImpDefReg) + continue; + if (MO.readsReg()) + MO.setIsUndef(); + else if (MO.isDef()) + ImpDefReg = 0; + } + } + } + + MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged); + for (unsigned ImpDef : ImpDefs) + MIB.addReg(ImpDef, RegState::ImplicitDefine); + } else { + // Remove kill flags: We are possibly storing the values later now. + assert(isi32Store(Opcode)); + for (MachineInstr &MI : FixupRange) { + for (MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.isKill()) + continue; + if (UsedRegs.count(MO.getReg())) + MO.setIsKill(false); + } + } + assert(ImpDefs.empty()); + } + + return Merged; +} + +/// Check if the given instruction increments or decrements a register and +/// return the amount it is incremented/decremented. 
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg) { + switch (MI.getOpcode()) { + case NDS32::ADDI: + case NDS32::ADDI333: + if (MI.getOperand(0).getReg() != Reg || + MI.getOperand(1).getReg() != Reg) + return 0; + else + return MI.getOperand(2).getImm(); + default: + break; + } + + return 0; +} + +/// Searches for an increment or decrement of \p Reg before \p MBBI. +MachineBasicBlock::iterator NDS32LoadStoreOpt:: +findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, + int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (MBBI == BeginMBBI) + return EndMBBI; + + // Skip debug values. + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while ((PrevMBBI->isDebugValue() || PrevMBBI->isCFIInstruction()) + && PrevMBBI != BeginMBBI) + --PrevMBBI; + + Offset = isIncrementOrDecrement(*PrevMBBI, Reg); + + return Offset == 0 ? EndMBBI : PrevMBBI; +} + +/// Searches for a increment or decrement of \p Reg after \p MBBI. +static MachineBasicBlock::iterator +findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg, + int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + // Skip debug values. + while (NextMBBI != EndMBBI && + (NextMBBI->isDebugValue() || NextMBBI->isCFIInstruction())) + ++NextMBBI; + + if (NextMBBI == EndMBBI) return EndMBBI; + + Offset = isIncrementOrDecrement(*NextMBBI, Reg); + return Offset == 0 ? EndMBBI : NextMBBI; +} + +bool NDS32LoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { + const MachineOperand &BaseOP = MI->getOperand(0); + unsigned Base = BaseOP.getReg(); + DebugLoc DL = MI->getDebugLoc(); + + // Can't use an updating ld/st if the base register is also a dest + // register. e.g. lmw.bim [r0], {r0, r1, r2}. 
The behavior is undefined. + for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) + if (MI->getOperand(i).getReg() == Base) + return false; + + // ToDo: Generate extra ADDI for multiple load/store if it's profitable. + + return false; +} + +static unsigned getLSMWOpcode(bool FindAdjustInsnBefore, int AdjustAmountBefore, + bool FindAdjustInsnAfter, int AdjustAmountAfter, + int BaseOffset, int OffsetDiff, unsigned Opcode, + ArrayRef> Regs) { + unsigned PReg = 0; + int LastOffset = 0; + + // In NDS32 multiple load/store, the lowest-numbered register is loaded + // from the lowest memory address while the highest-numbered register + // is loaded from the highest memory address + // E.g. + // SWI R0, [Ra + 4] + // SWI R1, [Ra + 8] <- OffsetDiff = 4, PReg = R0 + // + // Checking the register number and offset here + for (const std::pair &R : Regs) { + // LP, GP, SP, FP will encoded in Enable4 field. + // These four registers don't have to consecutive with other registers. + switch (R.first){ + case NDS32::LP: case NDS32::GP: case NDS32::SP: case NDS32::FP: + continue; + default: + break; + } + + if (PReg == 0) { + PReg = R.first; + continue; + } + + if (PReg + 1 != R.first) + return 0; + PReg = R.first; + } + + LastOffset = BaseOffset + OffsetDiff * (Regs.size() - 1); + // E.g. SP = SP - 48 <- AdjustAmountBefore + // SWI R0, [SP + 40] <- BaseOffset + // SWI LP, [SP + 44] <- LastOffset + // => + // SMW_ADM [SP], {LP, R0} + // SP = SP - 40 + if (FindAdjustInsnBefore && (LastOffset + AdjustAmountBefore == -4) + && isi32Store(Opcode)) { + ++NumSMW_ADMGened; + return NDS32::SMW_ADM; + } + + // E.g + // SWI R2, [R1 + 0] <- BaseOffset + // ADDI R1, R1, -4 <- AdjustAmountAfter + // => + // SWI_BI R2, [R1], -4 + if (FindAdjustInsnAfter && (BaseOffset + AdjustAmountAfter == -4) + && isi32Store(Opcode) && (Regs.size () == 1)) { + ++NumSWI_BIGened; + return NDS32::SWI_BI; + } + + // E.g. 
+ // LWI R0, [SP + 40] <- BaseOffset + // LWI LP, [SP + 44] <- LastOffset + // SP = SP + 48 <- AdjustAmountAfter + // => + // SP = SP + 40 + // LMW_BIM [SP], {LP, R0} + if (FindAdjustInsnAfter && (LastOffset + 4 == AdjustAmountAfter) + && isLoadSingle(Opcode)) { + ++NumLMW_BIMGened; + return NDS32::LMW_BIM; + } + + // Skip single load/store + if (Regs.size() == 1) + return 0; + + if (OffsetDiff != 4) + return 0; + + // E.g + // SWI R17, [R5 + 4] <- BaseOffset + // SWI R18, [R5 + 8] + // => + // SMW_AI [R5], {R17, R18} + if (isi32Store(Opcode) && (BaseOffset == 4)) { + ++NumSMW_AIGened; + return NDS32::SMW_AI; + } + // E.g + // LWI R17, [R5 + 4] <- BaseOffset + // LWI R18, [R5 + 8] + // => + // LMW_AI [R5], {R17, R18} + if (isLoadSingle(Opcode) && (BaseOffset == 4)) { + ++NumLMW_AIGened; + return NDS32::LMW_AI; + } + + // We need to generate extra Base adjustment ADDI before + // multiple load/store if BaseOffse != 0. + // We simply ignore the case here. + if ((BaseOffset != 0) && (LastOffset != 0)) + return 0; + + // E.g + // SWI R17, [R5 - 4] + // SWI R18, [R5 + 0] <- LastOffset + // => + // SMW_BD [R5], {R17, R18} + if (isi32Store(Opcode) && (LastOffset == 0)) { + ++NumSMW_BDGened; + return NDS32::SMW_BD; + } + + // E.g + // SWI R17, [R5 + 0] <- BaseOffset + // SWI R18, [R5 + 4] + // => + // SMW_BI [R5], {R17, R18} + if (isi32Store(Opcode) && (BaseOffset == 0)) { + ++NumSMW_BIGened; + return NDS32::SMW_BI; + } + + // E.g + // LWI R17, [R5 - 4] + // LWI R18, [R5 + 0] <- LastOffset + // => + // LMW_BD [R5], {R17, R18} + if (isLoadSingle(Opcode) && (LastOffset == 0)) { + ++NumLMW_BDGened; + return NDS32::LMW_BD; + } + + // E.g + // LWI R17, [R5 + 0] <- BaseOffset + // LWI R18, [R5 + 4] + // => + // LMW_BI [R5], {R17, R18} + if (isLoadSingle(Opcode) && (BaseOffset == 0)) { + ++NumLMW_BIGened; + return NDS32::LMW_BI; + } + return 0; +} + +static int AddRegToListWithoutEnable4(MachineInstrBuilder LSMW, + unsigned Reg) { + switch (Reg) { + case NDS32::SP: + return 1; + 
case NDS32::LP: + return 2; + case NDS32::GP: + return 4; + case NDS32::FP: + return 8; + default: + break; + } + + LSMW.addReg(Reg, RegState::Kill); // add Register list + return 0; +} + +static void CreateRegList(MachineInstrBuilder LSMW, + ArrayRef> Regs, + int OffsetDiff) { + int Enable4 = 0; + + // Register number is decrement in Regs if OffsetDiff == -4. + // We want Reglist to be increment when generate multiple load/store. + if (OffsetDiff == -4) { + for (int I = Regs.size() - 1; I > -1; I--) + Enable4 += AddRegToListWithoutEnable4 (LSMW, Regs[I].first); + } else { + for (const std::pair &R : Regs) + Enable4 += AddRegToListWithoutEnable4 (LSMW, R.first); + } + + // Insert Enable4 registers in the end of the RegList + if (Enable4 & 8) + LSMW.addReg(NDS32::FP, RegState::Kill); + if (Enable4 & 4) + LSMW.addReg(NDS32::GP, RegState::Kill); + if (Enable4 & 2) + LSMW.addReg(NDS32::LP, RegState::Kill); + if (Enable4 & 1) + LSMW.addReg(NDS32::SP, RegState::Kill); +} + +#include +/// Create and insert a LMW or SMW with Base as base register and registers in +/// Regs as the register operands that would be loaded / stored. It returns +/// true if the transformation is done. +MachineInstr *NDS32LoadStoreOpt::CreateLoadStoreMulti( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + int BaseOffset, int OffsetDiff, unsigned Base, bool BaseKill, + unsigned Opcode, const DebugLoc &DL, + ArrayRef> Regs, + MachineInstr *EarliestMI, MachineInstr *LatestMI) { + + MachineBasicBlock::iterator MBBIEarliest(EarliestMI); + MachineBasicBlock::iterator MBBILatest(LatestMI); + int AdjustAmountBefore; + int AdjustAmountAfter; + bool FindAdjustInsnBefore = false; + bool FindAdjustInsnAfter = false; + + // Find Adjustment instruction before store + // E.g. 
+ // addi $sp, $sp, -88 <= BaseAdjustInstrBefore + // swi $fp, [$sp + (84)] <= MBBIEarliest + // swi $r8, [$sp + (80)] + MachineBasicBlock::iterator BaseAdjustInstrBefore + = findIncDecBefore(MBBIEarliest, Base, AdjustAmountBefore); + + if (BaseAdjustInstrBefore != MBB.end()) + FindAdjustInsnBefore = true; + + // Find Adjustment instruction after load/store + // E.g. + // lwi $r8, [$sp + (80)] + // lwi $fp, [$sp + (84)] <= MBBILatest + // addi $sp, $sp, 88 <= BaseAdjustInstrAfter + MachineBasicBlock::iterator BaseAdjustInstrAfter + = findIncDecAfter(MBBILatest, Base, AdjustAmountAfter); + + if (BaseAdjustInstrAfter != MBB.end()) + FindAdjustInsnAfter = true; + + unsigned LSMWOpcode = getLSMWOpcode(FindAdjustInsnBefore, AdjustAmountBefore, + FindAdjustInsnAfter, AdjustAmountAfter, + BaseOffset, OffsetDiff, Opcode, Regs); + + if (LSMWOpcode == 0) + return nullptr; + + if (LSMWOpcode == NDS32::SWI_BI) { + MachineInstrBuilder SWI_BI; + // Currently, only consider emit AdjustAmount by ADDI + if (!isInt<15>(AdjustAmountAfter)) + return nullptr; + + SWI_BI = BuildMI(MBB, InsertBefore, DL, TII->get(NDS32::SWI_BI)); + SWI_BI.addReg(Base, RegState::Define); + SWI_BI.addReg(Regs[0].first, RegState::Kill); + SWI_BI.addReg(Base, RegState::Define); + SWI_BI.addImm (AdjustAmountAfter); + + // Remove original Base adjustment instruction. + if (BaseAdjustInstrAfter != MBB.end()) + MBB.erase(BaseAdjustInstrAfter); + + return SWI_BI; + } else if (LSMWOpcode == NDS32::LMW_BIM) { + MachineInstrBuilder LMW; + + int AdjustAmount = AdjustAmountAfter - 4 * Regs.size(); + + // Currently, only consider emit AdjustAmount by ADDI + if (!isInt<15>(AdjustAmount)) + return nullptr; + + // Emit new Base adjustment instruction. 
+ if (AdjustAmount) + BuildMI(MBB, InsertBefore, DL, TII->get(NDS32::ADDI)) + .addReg(Base, RegState::Define) + .addReg(Base).addImm (AdjustAmount); + + LMW = BuildMI(MBB, InsertBefore, DL, TII->get(LSMWOpcode)); + LMW.addReg(Base, RegState::Define); // add $wb + LMW.addReg(Base, getKillRegState(BaseKill)); // add BaseReg [Ra] + + CreateRegList (LMW, Regs, OffsetDiff); + + // Remove original Base adjustment instruction. + if (BaseAdjustInstrAfter != MBB.end()) + MBB.erase(BaseAdjustInstrAfter); + + return LMW; + } else if (LSMWOpcode == NDS32::SMW_ADM) { + MachineInstrBuilder SMW; + + int AdjustAmount = AdjustAmountBefore + 4 * Regs.size(); + + // Currently, only consider emit AdjustAmount by ADDI + if (!isInt<15>(AdjustAmount)) + return nullptr; + + SMW = BuildMI(MBB, InsertBefore, DL, TII->get(LSMWOpcode)); + SMW.addReg(Base, RegState::Define); // add $wb + SMW.addReg(Base, getKillRegState(BaseKill)); // add Base [Ra] + + CreateRegList (SMW, Regs, OffsetDiff); + + // Emit new Base adjustment instruction. + if (AdjustAmount) + BuildMI(MBB, InsertBefore, DL, TII->get(NDS32::ADDI)) + .addReg(Base, RegState::Define) + .addReg(Base).addImm (AdjustAmount); + + // Remove original Base adjustment instruction. 
+ if (BaseAdjustInstrBefore != MBB.end()) + MBB.erase(BaseAdjustInstrBefore); + + return SMW; + } else { + MachineInstrBuilder LSMW; + + LSMW = BuildMI(MBB, InsertBefore, DL, TII->get(LSMWOpcode)); + LSMW.addReg(Base); // add Base [Ra] + + CreateRegList(LSMW, Regs, OffsetDiff); + return LSMW; + } + + return nullptr; +} + +// Determine the load/store instructions could be merged +static bool isInstCanMerge(unsigned CurOpcode, unsigned Opcode) { + if (CurOpcode == Opcode) + return true; + // if instructions are i32 bit store without Modify Base return true + if (isi32StoreWithoutBI(CurOpcode) && isi32StoreWithoutBI(Opcode)) + return true; + // if instructions are i32 bit load without Modify Base return true + if (isi32LoadWithoutBI(CurOpcode) && isi32LoadWithoutBI(Opcode)) + return true; + return false; +} + +/// An optimization pass to turn multiple load / store ops of the same base and +/// incrementing offset into LMW / SMW ops. +bool NDS32LoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { + MemOpQueue MemOps; + unsigned CurrBase = 0; + unsigned CurrOpc = ~0u; + unsigned Position = 0; + + for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); + I = MBBI) { + // The instruction in front of the iterator is the one we look at. + MBBI = std::prev(I); + ++Position; + + if (isMemoryOp(*MBBI)) { + unsigned Opcode = MBBI->getOpcode(); + const MachineOperand &MO = MBBI->getOperand(0); + unsigned Reg = MO.getReg(); + unsigned Base = getLoadStoreBaseReg(*MBBI); + int Offset = getMemoryOpOffset(*MBBI); + + // Base register should not exist in register list + // for NDS32 multiple load/store instructions. + // E.g. lmw.bim $r3, [$r3], $r3, 0 is forbidden. + if (Base == Reg) + continue; + + if (CurrBase == 0) { + // Start of a new chain. + CurrBase = Base; + CurrOpc = Opcode; + MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Reg, Position)); + continue; + } + // Note: No need to match PredReg in the next if. 
+ if (isInstCanMerge(CurrOpc, Opcode) && CurrBase == Base) { + // Watch out for: + // r4 := lwi [r0, #8] + // r4 := lwi [r0, #4] + // or + // r0 := lwi [r0] + // If a load overrides the base register or a register loaded by + // another load in our chain, we cannot take this instruction. + bool Overlap = false; + if (isLoadSingle(Opcode)) { + Overlap = (Base == Reg); + if (!Overlap) { + for (const MemOpQueueEntry &E : MemOps) { + if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) { + Overlap = true; + break; + } + } + } + } + if (!Overlap) { + // Check offset and sort memory operation by offset + // into the current chain. + if (Offset > MemOps.back().Offset) { + MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Reg, Position)); + continue; + } else { + MemOpQueue::iterator MI, ME; + for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) { + if (Offset < MI->Offset) { + // Found a place to insert. + break; + } + if (Offset == MI->Offset) { + // Collision, abort. + MI = ME; + break; + } + } + if (MI != MemOps.end()) { + MemOps.insert(MI, MemOpQueueEntry(*MBBI, Offset, Reg, Position)); + continue; + } + } + } + } + + // Don't advance the iterator; The op will start a new chain next. + MBBI = I; + --Position; + // Fallthrough to look into existing chain. + } else if (MBBI->isDebugValue()) + continue; + + // If we are here then the chain is broken; Extract candidates for a merge. + if (MemOps.size() > 0) { + FormCandidates(MemOps); + // Reset for the next chain. + CurrBase = 0; + CurrOpc = ~0u; + MemOps.clear(); + } + + } + if (MemOps.size() > 0) + FormCandidates(MemOps); + + // Sort candidates so they get processed from end to begin of the basic + // block later; This is necessary for liveness calculation. + auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) { + return M0->InsertPos < M1->InsertPos; + }; + std::sort(Candidates.begin(), Candidates.end(), LessThan); + + // Go through list of candidates and merge. 
+  bool Changed = false;
+  for (const MergeCandidate *Candidate : Candidates) {
+    if (Candidate->CanMergeToLSMulti) {
+      MachineInstr *Merged = MergeOpsUpdate(*Candidate);
+      // Merge preceding/trailing base inc/dec into the merged op.
+      if (Merged) {
+        Changed = true;
+        MergeBaseUpdateLSMultiple(Merged);
+      }
+    } else {
+      assert(Candidate->Instrs.size() == 1);
+    }
+  }
+  Candidates.clear();
+  MergeBaseCandidates.clear();
+
+  return Changed;
+}
+
+bool NDS32LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  MF = &Fn;
+  STI = &static_cast<const NDS32Subtarget &>(Fn.getSubtarget());
+  TL = STI->getTargetLowering();
+  NFI = Fn.getInfo<NDS32MachineFunctionInfo>();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+
+  RegClassInfoValid = false;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= LoadStoreMultipleOpti(MBB);
+  }
+
+  Allocator.DestroyAll();
+  return Modified;
+}
+
+/// Returns an instance of the load / store optimization pass.
+FunctionPass *llvm::createNDS32LoadStoreOptimizationPass(bool PreAlloc) {
+  return new NDS32LoadStoreOpt();
+}
Index: lib/Target/NDS32/NDS32Predicate.td
===================================================================
--- lib/Target/NDS32/NDS32Predicate.td
+++ lib/Target/NDS32/NDS32Predicate.td
@@ -227,6 +227,13 @@
   let PrintMethod = "printAddrImmOffsetOperand";
 }
 
+// A list of registers. Used by load/store multiple.
+def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
+def reglist : Operand<i32> {
+  let MIOperandInfo = (ops GPR:$base, variable_ops);
+  let PrintMethod = "printRegisterList";
+}
+
 //===----------------------------------------------------------------------===//
 // Branch/Jump target Operands Predicates
 
Index: lib/Target/NDS32/NDS32TargetMachine.cpp
===================================================================
--- lib/Target/NDS32/NDS32TargetMachine.cpp
+++ lib/Target/NDS32/NDS32TargetMachine.cpp
@@ -23,6 +23,11 @@
 
 #define DEBUG_TYPE "nds32"
 
+static cl::opt<bool>
+EnableNDS32LoadStoreOpt("nds32-load-store-opt", cl::Hidden,
+  cl::desc("Enable NDS32 load/store optimization pass"),
+  cl::init(true));
+
 extern "C" void LLVMInitializeNDS32Target() {
   // Register the target.
   RegisterTargetMachine<NDS32TargetMachine> X(getTheNDS32Target());
@@ -117,6 +122,7 @@
     return getTM<NDS32TargetMachine>();
   }
   bool addInstSelector() override;
+  void addPreSched2() override;
 };
 } // namespace
@@ -129,3 +135,10 @@
   addPass(createNDS32ISelDag(getNDS32TargetMachine(), getOptLevel()));
   return false;
 }
+
+void NDS32PassConfig::addPreSched2() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    if (EnableNDS32LoadStoreOpt)
+      addPass(createNDS32LoadStoreOptimizationPass());
+  }
+}
Index: test/CodeGen/NDS32/load-store-insns.ll
===================================================================
--- test/CodeGen/NDS32/load-store-insns.ll
+++ test/CodeGen/NDS32/load-store-insns.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mattr=+no-16bit < %s | FileCheck %s
+; RUN: llc -mattr=+no-16bit -nds32-load-store-opt=false < %s | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
 target triple = "nds32le---elf"
Index: test/CodeGen/NDS32/multiple-load-store.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NDS32/multiple-load-store.ll
@@ -0,0 +1,87 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "nds32le---elf"
+
+%struct.tr = type { i32, i32, i32 }
+
+@.str = private unnamed_addr constant [5 x i8] c"main\00", align 1
+@lsmw5.t = private unnamed_addr constant %struct.tr { i32 1, i32 2, i32 3 }, align 4
+
+; Function Attrs: nounwind readonly
+define i32 @lsmw1(i8* nocapture readonly %a) local_unnamed_addr #0 {
+entry:
+; CHECK: smw.adm $sp, [$sp], $sp, 2
+  %call = tail call i32 @strcmp(i8* %a, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0))
+  ret i32 %call
+; CHECK: lmw.bim $sp, [$sp], $sp, 2
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @strcmp(i8* nocapture, i8* nocapture) local_unnamed_addr #1
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @lsmw2(i64 %sum, i32 %x) local_unnamed_addr #2 {
+entry:
+; CHECK: smw.adm $r6, [$sp], $r7, 2
+  %sext = shl i64 %sum, 32
+  %conv1 = ashr exact i64 %sext, 32
+  %conv2 = sext i32 %x to i64
+  %mul = mul nsw i64 %conv1, %conv2
+  %add = add nsw i64 %mul, %sum
+  ret i64 %add
+; CHECK: lmw.bim $r6, [$sp], $r7, 2
+}
+
+; Function Attrs: nounwind
+define i32 @lsmw3(i32 %i, i32 %j, i32 %k, %struct.tr* byval nocapture readonly align 4 %t) local_unnamed_addr #3 {
+entry:
+; CHECK: smw.ai $r6, [$sp], $r6, 2
+  %a = getelementptr inbounds %struct.tr, %struct.tr* %t, i32 0, i32 0
+  %0 = load i32, i32* %a, align 4
+  %cmp = icmp eq i32 %0, 1
+  br i1 %cmp, label %lor.lhs.false, label %if.then
+
+lor.lhs.false:                                    ; preds = %entry
+  %b = getelementptr inbounds %struct.tr, %struct.tr* %t, i32 0, i32 1
+  %1 = load i32, i32* %b, align 4
+  %cmp1 = icmp eq i32 %1, 2
+  br i1 %cmp1, label %lor.lhs.false2, label %if.then
+
+lor.lhs.false2:                                   ; preds = %lor.lhs.false
+  %c = getelementptr inbounds %struct.tr, %struct.tr* %t, i32 0, i32 2
+  %2 = load i32, i32* %c, align 4
+  %cmp3 = icmp ne i32 %2, 3
+  %cmp5 = icmp ne i32 %i, 4
+  %or.cond = or i1 %cmp5, %cmp3
+  %cmp7 = icmp ne i32 %j, 5
+  %or.cond10 = or i1 %cmp7, %or.cond
+  %cmp9 = icmp ne i32 %k, 6
+  %or.cond11 = or i1 %cmp9, %or.cond10
+  br i1 %or.cond11, label %if.then, label %if.end
+
+if.then: ; preds = %lor.lhs.false, %entry, %lor.lhs.false2 + tail call void @abort() #7 + unreachable + +if.end: ; preds = %lor.lhs.false2 +; CHECK: lmw.ai $r6, [$sp], $r6, 2 + ret i32 undef +} + +; Function Attrs: noreturn +declare void @abort() local_unnamed_addr #4 + +; Function Attrs: noreturn nounwind +define i32 @lsmw5() local_unnamed_addr #6 { +entry: +; CHECK: smw.adm $sp, [$sp], $sp, 2 + %call = tail call i32 bitcast (i32 (...)* @foo to i32 (%struct.tr*, i32, i32, i32)*)(%struct.tr* byval nonnull align 4 @lsmw5.t, i32 4, i32 5, i32 6) #8 +; CHECK: lmw.ai $r1, [$r0], $r2, 0 + tail call void @exit(i32 0) #7 + unreachable +} + +declare i32 @foo(...) local_unnamed_addr #5 + +; Function Attrs: noreturn +declare void @exit(i32) local_unnamed_addr #4 Index: test/CodeGen/NDS32/prolog-endprolog.ll =================================================================== --- test/CodeGen/NDS32/prolog-endprolog.ll +++ test/CodeGen/NDS32/prolog-endprolog.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -nds32-load-store-opt=false < %s | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64" target triple = "nds32le---elf"