diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -772,6 +772,8 @@
     /// Once this function is called, no other functions on this object are
     /// valid; the loop has been removed.
     virtual void disposed() = 0;
+
+    virtual unsigned getTripCountReg() { return 0; }
   };
 
   /// Analyze loop L, which must be a single-basic-block loop, and if the
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -820,6 +820,16 @@
       }
       MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
       MachineMemOperand *MMO2 = *MI.memoperands_begin();
+
+      // Workaround for loop distribution. Verification is still required.
+      if (MMO1->getValue() && MMO2->getPseudoValue()) {
+        if (!MMO2->getPseudoValue()->mayAlias(&MFI))
+          continue;
+      } else if (MMO2->getValue() && MMO1->getPseudoValue()) {
+        if (!MMO1->getPseudoValue()->mayAlias(&MFI))
+          continue;
+      }
+
       if (!MMO1->getValue() || !MMO2->getValue()) {
         SDep Dep(Load, SDep::Barrier);
         Dep.setLatency(1);
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -44,6 +44,7 @@
 FunctionPass *createAArch64SpeculationHardeningPass();
 FunctionPass *createAArch64KCFIPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64LoopDistribute();
 ModulePass *createAArch64LowerHomogeneousPrologEpilogPass();
 FunctionPass *createAArch64SIMDInstrOptPass();
 ModulePass *createAArch64PromoteConstantPass();
@@ -88,6 +89,7 @@
 void initializeAArch64GlobalsTaggingPass(PassRegistry &);
 void initializeAArch64KCFIPass(PassRegistry &);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64LoopDistributePass(PassRegistry &);
 void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
 void initializeAArch64MIPeepholeOptPass(PassRegistry &);
 void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -223,6 +223,10 @@
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+  analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
   bool
   reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
   bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -14,6 +14,7 @@
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
@@ -24,6 +25,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
@@ -8327,6 +8329,68 @@
   return AArch64::BLR;
 }
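+// Pipeliner support: model the loop's compare and conditional branch so that
+// MachinePipeliner can ignore them while scheduling and recreate the
+// loop-control check itself.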
+namespace {
+class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+  MachineInstr *Comp, *CondBranch;
+  MachineBasicBlock *TBB;
+  SmallVector<MachineOperand, 4> BrCond;
+  const TargetInstrInfo *TII;
+
+public:
+  AArch64PipelinerLoopInfo(MachineInstr *Comp, MachineInstr *CondBranch,
+                           MachineBasicBlock *TBB,
+                           const SmallVectorImpl<MachineOperand> &BrCond)
+      : Comp(Comp), CondBranch(CondBranch), TBB(TBB),
+        BrCond(BrCond.begin(), BrCond.end()),
+        TII(CondBranch->getParent()->getParent()->getSubtarget()
+                .getInstrInfo()) {}
+
+  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+    return MI == Comp || MI == CondBranch;
+  }
+
+  std::optional<bool> createTripCountGreaterCondition(
+      int TC, MachineBasicBlock &MBB,
+      SmallVectorImpl<MachineOperand> &Cond) override {
+    Cond = BrCond;
+    if (TBB == CondBranch->getParent())
+      TII->reverseBranchCondition(Cond);
+    return {};
+  }
+
+  void setPreheader(MachineBasicBlock *NewPreheader) override {}
+
+  void adjustTripCount(int TripCountAdjust) override {}
+
+  void disposed() override {}
+};
+} // namespace
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+  MachineBasicBlock *TBB, *FBB;
+  SmallVector<MachineOperand, 4> BrCond;
+  if (analyzeBranch(*LoopBB, TBB, FBB, BrCond))
+    return nullptr;
+
+  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
+  if (CondBranch->getOpcode() != AArch64::Bcc)
+    return nullptr;
+
+  // The compare paired with the branch is the last NZCV def before the
+  // terminators.
+  MachineInstr *Comp = nullptr;
+  for (auto I = LoopBB->getFirstTerminator(); I != LoopBB->begin(); --I) {
+    if (I->findRegisterDefOperandIdx(AArch64::NZCV) != -1) {
+      Comp = &*I;
+      break;
+    }
+  }
+  if (!Comp)
+    return nullptr;
+
+  return std::make_unique<AArch64PipelinerLoopInfo>(Comp, CondBranch, TBB,
+                                                    BrCond);
+}
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64LoopDistribute.cpp b/llvm/lib/Target/AArch64/AArch64LoopDistribute.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64LoopDistribute.cpp
@@ -0,0 +1,1183 @@
+//===- AArch64LoopDistribute.cpp - AArch64 Loop Distribute Pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A prototype of loop distribution for IPC improvement.
+//
+// This pass runs before pipelining and splits loops so that they can be
+// pipelined more efficiently. So far only a simple test program has been
+// confirmed to pipeline successfully, and further work is needed before the
+// transformation is correct in general. It is implemented for AArch64 only to
+// keep the prototype simple; the eventual goal is to make it target
+// independent.
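+// The loop is split into two single-block loops, and values that cross the
+// split are passed between them through temporary arrays allocated on the
+// stack.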
+// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64RegisterInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/MathExtras.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-distribute" + +static cl::opt NumRegs("loop-dist-num-regs", + cl::desc("Number of registers."), cl::Hidden, + cl::init(32)); + +namespace { +class AArch64LoopDistribute : public MachineFunctionPass { +public: + static char ID; + + MachineFunction *MF = nullptr; + const MachineLoopInfo *MLI = nullptr; + const TargetInstrInfo *TII = nullptr; + AliasAnalysis *AA; + + AArch64LoopDistribute() : MachineFunctionPass(ID) { + initializeAArch64LoopDistributePass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + void distributeLoop(MachineLoop &L); +}; +char AArch64LoopDistribute::ID = 0; +} // namespace + +INITIALIZE_PASS_BEGIN(AArch64LoopDistribute, DEBUG_TYPE, + "AArch64 Loop Distribute", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(AArch64LoopDistribute, DEBUG_TYPE, + "AArch64 Loop Distribute", false, false) + +namespace { +class Distribute : public ScheduleDAGInstrs { + const MCSubtargetInfo &STI; + const MCSchedModel &SM; + MachineRegisterInfo &MRI; + MachineFrameInfo &MFI; + std::unique_ptr PLI; + MachineBasicBlock *OrgPreheader; + + struct EstIIDetail { + int EstimatedII; + int ResMII; + double RequiredParallel; + double PossibleParallel; + int LenCritPath; + int NumRegPerIter; + int NumRegInvariant; + int NumRegOutput; + int NumTmpArray; + void print(raw_ostream &OS) { + OS << "Estimated II: " << EstimatedII << "\n" + << "Resource Min II: " << ResMII << "\n" + << "Required Num Parallel: " << RequiredParallel << "\n" + << "Possible Num Parallel: " << PossibleParallel << "\n" + << "Latency Critical Path Length: " << LenCritPath << "\n" + << "Required Num Registers Per Iter: " << NumRegPerIter << "\n" + << "Required Num Registers For Invariant: " << 
NumRegInvariant << "\n"
+         << "Required Num Registers For Output: " << NumRegOutput << "\n"
+         << "Num Temporary Arrays: " << NumTmpArray << "\n";
+    }
+  };
+
+  void regPressListSched(std::vector<SUnit *> &RPSched);
+
+  /// Handle only a specific kind of register (currently FPR64).
+  bool isTargetRegKind(MachineInstr *MI) {
+    for (MachineOperand &MO : MI->defs()) {
+      if (isTargetRegKind(MO.getReg()))
+        return true;
+    }
+    return false;
+  }
+  bool isTargetRegKind(const SDep &Dep) {
+    return Dep.isAssignedRegDep() && isTargetRegKind(Dep.getReg());
+  }
+  bool isTargetRegKind(unsigned Reg) {
+    return MRI.getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass);
+  }
+  int estII(std::vector<SUnit *>::iterator B, std::vector<SUnit *>::iterator E,
+            EstIIDetail &Detail);
+  int maxNumRegPerIter(std::vector<SUnit *>::iterator B,
+                       std::vector<SUnit *>::iterator E);
+  int lenCritPath(std::vector<SUnit *>::iterator B,
+                  std::vector<SUnit *>::iterator E);
+  int resMII(std::vector<SUnit *>::iterator B,
+             std::vector<SUnit *>::iterator E);
+  int numInvariant(std::vector<SUnit *>::iterator B,
+                   std::vector<SUnit *>::iterator E);
+  int numOutRegs(std::vector<SUnit *>::iterator B,
+                 std::vector<SUnit *>::iterator E);
+  int numTmpArrays(std::vector<SUnit *>::iterator B,
+                   std::vector<SUnit *>::iterator E);
+  void gatherPredSUs(const std::set<SUnit *> &Inside, SUnit *Root,
+                     std::set<SUnit *> &Result);
+  unsigned createCounter(MachineBasicBlock *MBB, MachineBasicBlock *Preheader);
+  void replacePhiBB(MachineInstr *MI, MachineBasicBlock *To,
+                    MachineBasicBlock *From);
+  MachineInstr *cloneInstr(MachineInstr *MI, MachineBasicBlock *To,
+                           MachineBasicBlock::iterator ToIter,
+                           std::map<Register, Register> &VRMap);
+  unsigned getLoadOpc(unsigned Reg);
+  unsigned getStoreOpc(unsigned Reg);
+  std::vector<SUnit *>::iterator findDistPoint(std::vector<SUnit *> &RPSched);
+  int countNumTmpLoads(std::vector<SUnit *>::iterator B,
+                       std::vector<SUnit *>::iterator E);
+  int countNumTmpStores(std::vector<SUnit *>::iterator B,
+                        std::vector<SUnit *>::iterator E);
+
+  bool findTripCountReg(Register *LCReg, SUnit **Comp);
+  void constructBlocks(MachineBasicBlock *&OrgExitBB, MachineBasicBlock *&BB1,
+                       MachineBasicBlock *&BB2,
+                       MachineBasicBlock *&PreheaderBB2,
+                       MachineBasicBlock *&UniqExitBB);
+  void cloneInstrs(std::vector<SUnit *> &RPSched,
+                   std::vector<SUnit *>::iterator DistPoint, SUnit *Comp,
+                   MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+                   MachineBasicBlock *PreheaderBB2,
+                   std::map<Register, Register> &VRMap1,
+                   std::map<Register, Register> &VRMap2);
+  void gatherRegsStored(std::map<Register, Register> &Reg2Offset,
+                        std::vector<SUnit *> &RPSched,
+                        std::vector<SUnit *>::iterator DistPoint);
+  void calcTmpArrayOffset(Register TripCount,
+                          std::map<Register, Register> &Reg2Offset,
+                          Register &LastAddr);
+  void allocStackArea(Register ArraySize, MachineBasicBlock *UniqExitBB,
+                      Register &Addr);
+  void insertTmpLoadStore(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+                          MachineBasicBlock *PreheaderBB2,
+                          std::map<Register, Register> &Reg2Offset,
+                          Register TmpAreaTop,
+                          std::map<Register, Register> &VRMap1);
+
+public:
+  void schedule() override;
+  Distribute(MachineFunction &MF, MachineLoop &L, const MachineLoopInfo *MLI)
+      : ScheduleDAGInstrs(MF, MLI), STI(MF.getSubtarget()),
+        SM(STI.getSchedModel()), MRI(MF.getRegInfo()), MFI(MF.getFrameInfo()),
+        OrgPreheader(L.getLoopPreheader()) {
+    assert(L.getBlocks().size() == 1);
+    assert(L.getLoopPreheader());
+  }
+};
+} // namespace
+
+void AArch64LoopDistribute::distributeLoop(MachineLoop &L) {
+  if (L.getBlocks().size() != 1 || !L.getLoopPreheader())
+    return;
+
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> PLI =
+      TII->analyzeLoopForPipelining(L.getTopBlock());
+  if (!PLI)
+    return;
+
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->analyzeBranch(*L.getLoopPreheader(), TBB, FBB, Cond))
+    return;
+  if (TII->analyzeBranch(*L.getTopBlock(), TBB, FBB, Cond))
+    return;
+
+  MachineBasicBlock *MBB = L.getTopBlock();
+  LLVM_DEBUG(MBB->dump());
+  // Size of the scheduling region: everything before the first terminator.
+  unsigned Size = MBB->size();
+  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
+                                   E = MBB->instr_end();
+       I != E; ++I, --Size)
+    ;
+  Distribute Dist(*MF, L, MLI);
+  Dist.startBlock(L.getTopBlock());
+  auto Iter = MBB->getFirstTerminator();
+  Dist.enterRegion(L.getTopBlock(), MBB->getFirstNonDebugInstr(), Iter, Size);
+  Dist.buildSchedGraph(AA);
+  Dist.dump();
+  Dist.schedule();
+}
+
+static Register getPhiLoopReg(MachineInstr &Phi) {
+  assert(Phi.getNumOperands() == 5);
+  int I;
+  if (Phi.getOperand(2).getMBB() == Phi.getParent())
+    I = 1;
+  else
+    I = 3;
+  return Phi.getOperand(I).getReg();
+}
+
+static Register getPhiInitReg(MachineInstr &Phi) {
+  assert(Phi.getNumOperands() == 5);
+  int I;
+  if (Phi.getOperand(2).getMBB() != Phi.getParent())
+    I = 1;
+  else
+    I = 3;
+  return Phi.getOperand(I).getReg();
+}
+
+unsigned Distribute::getLoadOpc(unsigned Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  switch (TRI->getSpillSize(*RC)) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      return AArch64::LDRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      return AArch64::LDRHui;
+    break;
+  case 4:
+    if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      return AArch64::LDRSui;
+    break;
+  case 8:
+    if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      return AArch64::LDRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      return AArch64::LDRQui;
+    break;
+  }
+  std::abort();
+}
+
+unsigned Distribute::getStoreOpc(unsigned Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  switch (TRI->getSpillSize(*RC)) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      return AArch64::STRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      return AArch64::STRHui;
+    break;
+  case 4:
+    if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      return AArch64::STRSui;
+    break;
+  case 8:
+    if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      return AArch64::STRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      return AArch64::STRQui;
+    break;
+  }
+  std::abort();
+}
+
+/// Repeatedly gather preceding nodes from Root, stopping at nodes contained in
+/// StopNodes.
+void Distribute::gatherPredSUs(const std::set<SUnit *> &StopNodes, SUnit *Root,
+                               std::set<SUnit *> &Result) {
+  std::set<SUnit *> Processing;
+  Processing.insert(Root);
+  Result.insert(Root);
+  while (Processing.size()) {
+    SUnit *SU = *Processing.begin();
+    Processing.erase(SU);
+    if (SU->getInstr()->isPHI()) {
+      SUnit *LoopPhi = getSUnit(MRI.getVRegDef(getPhiLoopReg(*SU->getInstr())));
+      assert(LoopPhi);
+      if (StopNodes.count(LoopPhi) || Result.count(LoopPhi))
+        continue;
+      Result.insert(LoopPhi);
+      Processing.insert(LoopPhi);
+      continue;
+    }
+    for (SDep &Pred : SU->Preds) {
+      SUnit *PredSU = Pred.getSUnit();
+      if (PredSU->isBoundaryNode() || StopNodes.count(PredSU) ||
+          Result.count(PredSU))
+        continue;
+      Result.insert(PredSU);
+      Processing.insert(PredSU);
+    }
+  }
+}
+
+/// Create a loop counter starting from 0 and increasing by 1.
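+/// The counters created here are used to index the temporary arrays that
+/// carry values from the first loop to the second.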
+unsigned Distribute::createCounter(MachineBasicBlock *MBB, + MachineBasicBlock *Preheader) { + Register Zero = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*Preheader, Preheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::MOVi64imm), Zero) + .addImm(0); + Register PhiReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + Register LoopReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*MBB, MBB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), + PhiReg) + .addReg(Zero) + .addMBB(Preheader) + .addReg(LoopReg) + .addMBB(MBB); + BuildMI(*MBB, MBB->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADDXri), LoopReg) + .addReg(PhiReg) + .addImm(1) + .addImm(0); + return PhiReg; +} + +void Distribute::replacePhiBB(MachineInstr *MI, MachineBasicBlock *To, + MachineBasicBlock *From) { + for (MachineOperand &MO : MI->operands()) + if (MO.isMBB() && MO.getMBB() == From) + MO.setMBB(To); +} + +MachineInstr *Distribute::cloneInstr(MachineInstr *MI, MachineBasicBlock *To, + MachineBasicBlock::iterator ToIter, + std::map &VRMap) { + MachineInstr *NewMI = MF.CloneMachineInstr(MI); + for (MachineOperand &MO : NewMI->defs()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + Register Reg = MO.getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + Register NewReg = MRI.createVirtualRegister(RC); + VRMap[Reg] = NewReg; + MO.setReg(NewReg); + } + if (NewMI->isPHI()) + for (MachineOperand &MO : NewMI->operands()) + if (MO.isMBB() && MO.getMBB() == MI->getParent()) + MO.setMBB(To); + To->insert(ToIter, NewMI); + return NewMI; +} + +/// Return the division point that minimizes II +std::vector::iterator +Distribute::findDistPoint(std::vector &RPSched) { + EstIIDetail Detail1, Detail2; + int MinII = estII(RPSched.begin(), RPSched.end(), Detail1); + LLVM_DEBUG({ + dbgs() << "Original: \n"; + Detail1.print(dbgs()); + dbgs() << "\n"; + }); + + int MinDiffII = MinII; + auto DistPoint = RPSched.end(); + for (auto I = RPSched.begin(); I != RPSched.end(); ++I) { + EstIIDetail DetailTmp1, DetailTmp2; + int II1 = estII(RPSched.begin(), I, DetailTmp1); + if (II1 == 0) + continue; + int II2 = estII(I, RPSched.end(), DetailTmp2); + if (II2 == 0) + continue; + int II = II1 + II2; + + if (II < MinII || + (II == MinII && DetailTmp1.NumTmpArray < Detail1.NumTmpArray) || + (II == MinII && DetailTmp2.NumTmpArray == Detail1.NumTmpArray && + std::abs(II1 - II2) < MinDiffII)) { + MinII = II; + DistPoint = I; + MinDiffII = std::abs(II1 - II2); + Detail1 = DetailTmp1; + Detail2 = DetailTmp2; + } + } + + LLVM_DEBUG({ + dbgs() << "First Block: \n"; + Detail1.print(dbgs()); + for (auto I = RPSched.begin(); I != DistPoint; ++I) + (*I)->getInstr()->dump(); + dbgs() << "\n"; + + dbgs() << "Second Block: \n"; + Detail2.print(dbgs()); + for (auto I = DistPoint; I != RPSched.end(); ++I) + (*I)->getInstr()->dump(); + dbgs() << "\n"; + }); + + return DistPoint; +} + +/// Returns a register representing the trip count. +/// A simple implementation that may return an incorrect result. 
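+/// It takes the last compare in the block and assumes that its first source
+/// operand is defined by a PHI whose incoming value from the preheader is the
+/// trip count.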
+bool Distribute::findTripCountReg(Register *TripCount, SUnit **Comp) { + for (auto I = BB->rbegin(); I != BB->rend(); ++I) { + if (I->isCompare()) { + *Comp = getSUnit(&*I); + MachineInstr *CompMI = (*Comp)->getInstr(); + MachineOperand &TC = CompMI->getOperand(1); + if (!TC.isReg()) + return false; + MachineInstr *Phi = MRI.getVRegDef(TC.getReg()); + if (!Phi->isPHI()) + return false; + *TripCount = getPhiInitReg(*Phi); + return true; + } + } + return false; +} + +/// Insert new blocks and branches and modify the CFG. +void Distribute::constructBlocks(MachineBasicBlock *&OrgExitBB, + MachineBasicBlock *&BB1, + MachineBasicBlock *&BB2, + MachineBasicBlock *&PreheaderBB2, + MachineBasicBlock *&UniqExitBB) { + // OrgPreheader: + // b BB1 + // + // BB1: + // b.cc BB1 + // b PreheaderBB2 + // + // PreheaderBB2: + // b BB2 + // + // BB2: + // b.cc BB2 + // b UniqExitBB + // + // UniqExitBB: + // b OrgExitBB + + BB1 = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + BB2 = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + PreheaderBB2 = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + UniqExitBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + MF.insert(BB->getIterator(), BB1); + MF.insert(BB->getIterator(), PreheaderBB2); + MF.insert(BB->getIterator(), BB2); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + TII->analyzeBranch(*OrgPreheader, TBB, FBB, Cond); + assert(!FBB); + if (TBB) { + TII->removeBranch(*OrgPreheader); + TII->insertUnconditionalBranch(*OrgPreheader, BB1, DebugLoc()); + } + TII->analyzeBranch(*BB, TBB, FBB, Cond); + TII->insertBranch(*BB1, TBB == BB ? BB1 : PreheaderBB2, + TBB == BB ? PreheaderBB2 : BB1, Cond, DebugLoc()); + TII->insertBranch(*BB2, TBB == BB ? BB2 : UniqExitBB, + TBB == BB ? UniqExitBB : BB2, Cond, DebugLoc()); + + if (!FBB) + FBB = BB->getNextNode(); + + if (TBB == BB) + OrgExitBB = FBB; + else + OrgExitBB = TBB; + + MF.insert(OrgExitBB->getIterator(), UniqExitBB); + TII->insertUnconditionalBranch(*UniqExitBB, OrgExitBB, DebugLoc()); + TII->insertUnconditionalBranch(*PreheaderBB2, BB2, DebugLoc()); + + OrgPreheader->replaceSuccessor(BB, BB1); + BB1->addSuccessor(BB1); + BB1->addSuccessor(PreheaderBB2); + PreheaderBB2->addSuccessor(BB2); + BB2->addSuccessor(BB2); + BB2->addSuccessor(UniqExitBB); + BB->removeSuccessor(OrgExitBB); + UniqExitBB->addSuccessor(OrgExitBB); +} + +/// Clone instructions to new blocks. +void Distribute::cloneInstrs(std::vector &RPSched, + std::vector::iterator DistPoint, + SUnit *Comp, MachineBasicBlock *BB1, + MachineBasicBlock *BB2, + MachineBasicBlock *PreheaderBB2, + std::map &VRMap1, + std::map &VRMap2) { + + // Clone instructions for each division + for (auto I = RPSched.begin(); I != DistPoint; ++I) + cloneInstr((*I)->getInstr(), BB1, BB1->getFirstTerminator(), VRMap1); + for (auto I = DistPoint; I != RPSched.end(); ++I) + cloneInstr((*I)->getInstr(), BB2, BB2->getFirstTerminator(), VRMap2); + + std::set BB1SUs(RPSched.begin(), DistPoint), + BB2SUs(DistPoint, RPSched.end()); + std::set CloneSUs1, CloneSUs2; + + // Gather instructions for loop control + if (!BB1SUs.count(Comp)) + gatherPredSUs(BB1SUs, Comp, CloneSUs1); + if (!BB2SUs.count(Comp)) + gatherPredSUs(BB2SUs, Comp, CloneSUs2); + + // Gather instructions contained in a cycle that crosses a division. + // Assume they are all trivial instructions. 
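+  // Such cycles are rooted at PHIs; the chains feeding them are duplicated
+  // into each block instead of being passed through the temporary arrays.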
+ for (auto I = RPSched.begin(); I != DistPoint; ++I) { + for (auto Dep : (*I)->Preds) { + if (Dep.getSUnit()->isBoundaryNode() || + !Dep.getSUnit()->getInstr()->isPHI()) + continue; + gatherPredSUs(BB1SUs, Dep.getSUnit(), CloneSUs1); + } + } + for (auto I = DistPoint; I != RPSched.end(); ++I) { + for (auto Dep : (*I)->Preds) { + if (Dep.getSUnit()->isBoundaryNode()) + continue; + if (Dep.getSUnit()->getInstr()->isPHI()) + gatherPredSUs(BB2SUs, Dep.getSUnit(), CloneSUs2); + else if (!BB2SUs.count(Dep.getSUnit()) && !isTargetRegKind(Dep)) + gatherPredSUs(BB2SUs, Dep.getSUnit(), CloneSUs2); + } + } + + // Clone gathered instructions. + for (SUnit *I : RPSched) + if (CloneSUs1.count(I)) + cloneInstr(I->getInstr(), BB1, BB1->getFirstTerminator(), VRMap1); + for (SUnit *I : CloneSUs1) + if (I->getInstr()->isPHI()) + cloneInstr(I->getInstr(), BB1, BB1->begin(), VRMap1); + for (SUnit *I : RPSched) + if (CloneSUs2.count(I)) + cloneInstr(I->getInstr(), BB2, BB2->getFirstTerminator(), VRMap2); + for (SUnit *I : CloneSUs2) + if (I->getInstr()->isPHI()) { + MachineInstr *NewMI = + cloneInstr(I->getInstr(), BB2, BB2->begin(), VRMap2); + replacePhiBB(NewMI, PreheaderBB2, OrgPreheader); + } + + // Replace references to new VRs. + for (auto &I : VRMap2) + // References outside the loop use the result of the division on the lower + // side. + for (auto &MO : llvm::make_early_inc_range(MRI.use_operands(I.first))) + if (MO.getParent()->getParent() != BB1) + MO.setReg(I.second); + for (auto &I : VRMap1) + for (auto &MO : llvm::make_early_inc_range(MRI.use_operands(I.first))) + if (MO.getParent()->getParent() != BB2) + MO.setReg(I.second); +} + +/// Gather registers across the division of the target register kind. +void Distribute::gatherRegsStored(std::map &Reg2Offset, + std::vector &RPSched, + std::vector::iterator DistPoint) { + std::set BB1SUs(RPSched.begin(), DistPoint); + for (auto I = RPSched.begin(); I != DistPoint; ++I) { + for (auto Dep : (*I)->Succs) + if (!Dep.getSUnit()->isBoundaryNode() && !BB1SUs.count(Dep.getSUnit()) && + isTargetRegKind(Dep)) { + Register Reg = Dep.getReg(); + for (auto Def : (*I)->getInstr()->defs()) + if (Def.isReg() && Def.getReg() == Reg) + Reg2Offset.insert({Def.getReg(), 0}); + } + } +} + +/// Calculate offsets of each temporary arrays. +void Distribute::calcTmpArrayOffset(Register TripCount, + std::map &Reg2Offset, + Register &ArraySize) { + Register TripCountCommon = + MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::COPY), TripCountCommon) + .addReg(TripCount); + + unsigned AlignSize = 8; + Register Zero = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::MOVi64imm), Zero) + .addImm(0); + Register Cur = Zero; + for (auto &I : Reg2Offset) { + // offset0 = 0 + // offset1 = sizeof(type0)*trip_count + // offset2 = (offset1+7)&-8 + sizeof(type1)*trip_count + // ... 
+ I.second = Cur; + const TargetRegisterClass *RC = MRI.getRegClass(I.first); + unsigned Size = TRI->getRegSizeInBits(*RC) / 8; + Register SizeReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::MOVi64imm), SizeReg) + .addImm(Size); + + Register MultReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::MADDXrrr), MultReg) + .addReg(TripCountCommon) + .addReg(SizeReg) + .addReg(Cur); + + Register AddReg2 = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADDXri), AddReg2) + .addReg(MultReg) + .addImm(AlignSize - 1) + .addImm(0); + + Register AndReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ANDXri), AndReg) + .addReg(AddReg2) + .addImm(AArch64_AM::encodeLogicalImmediate(UINT64_MAX - (AlignSize - 1), + 64)); + Cur = AndReg; + } + ArraySize = Cur; +} + +/// Allocate stack area for temporary arrays. +void Distribute::allocStackArea(Register ArraySize, + MachineBasicBlock *UniqExitBB, Register &Addr) { + // Need to confirm the correct method. + MFI.CreateVariableSizedObject(Align(1), nullptr); + MFI.setAdjustsStack(true); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADJCALLSTACKDOWN)) + .addImm(0) + .addImm(0); + Register CopySP = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::COPY), CopySP) + .addReg(AArch64::SP); + Register SubSP = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::SUBXrr), SubSP) + .addReg(CopySP) + .addReg(ArraySize); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::COPY), AArch64::SP) + .addReg(SubSP); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADJCALLSTACKUP)) + .addImm(0) + .addImm(0); + + BuildMI(*UniqExitBB, UniqExitBB->getFirstNonPHI(), DebugLoc(), + TII->get(AArch64::COPY), AArch64::SP) + .addReg(CopySP); + + Addr = SubSP; +} + +/// Insert load and store instructions for temporary arrays. +void Distribute::insertTmpLoadStore(MachineBasicBlock *BB1, + MachineBasicBlock *BB2, + MachineBasicBlock *PreheaderBB2, + std::map &Reg2Offset, + Register TmpAreaTop, + std::map &VRMap1) { + Register FirstCounter = createCounter(BB1, OrgPreheader); + Register SecondCounter = createCounter(BB2, PreheaderBB2); + std::map LoadVRMap; + unsigned Offset = 0; + for (auto &I : Reg2Offset) { + // Array0 = TmpArrayTop + Offset0 + // FirstCounter = 0 + // BB1: + // Reg0_1 = ... + // store(Reg0_1, Array0+FirstCounter*Size0) + // FirstCounter += 1 + // ... + // SecondCounter = 0 + // BB2: + // Reg0_2 = load(Array0+SecondCounter*Size0) + // ... = ... Reg0_2 ... + // SecondCounter += 1 + // ... 
+ + Register Array = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADDXrr), Array) + .addReg(I.second) + .addReg(TmpAreaTop); + + Register Addr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + unsigned Shift = + llvm::Log2_64(TRI->getRegSizeInBits(*MRI.getRegClass(I.first)) / 8); + BuildMI(*BB1, BB1->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADDXrs), Addr) + .addReg(Array) + .addReg(FirstCounter) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + MachineInstr *Store = BuildMI(*BB1, BB1->getFirstTerminator(), DebugLoc(), + TII->get(getStoreOpc(I.first))) + .addReg(VRMap1[I.first]) + .addReg(Addr) + .addImm(0); + + Addr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + Register Loaded = MRI.createVirtualRegister(MRI.getRegClass(I.first)); + MachineInstr *Load = BuildMI(*BB2, BB2->getFirstNonPHI(), DebugLoc(), + TII->get(getLoadOpc(Loaded)), Loaded) + .addReg(Addr) + .addImm(0); + BuildMI(*BB2, BB2->getFirstNonPHI(), DebugLoc(), TII->get(AArch64::ADDXrs), + Addr) + .addReg(Array) + .addReg(SecondCounter) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + + // Workaround to cause later analysis to determine that there are no + // aliases. Requires modification to correct pointer information. + Load->addMemOperand( + MF, + MF.getMachineMemOperand( + MachinePointerInfo(MF.getPSVManager().getConstantPool(), Offset), + MachineMemOperand::MOLoad, 8, Align(8))); + Store->addMemOperand( + MF, + MF.getMachineMemOperand( + MachinePointerInfo(MF.getPSVManager().getConstantPool(), Offset), + MachineMemOperand::MOStore, 8, Align(8))); + + LoadVRMap[I.first] = Loaded; + Offset += 8; + } + + for (auto &I : LoadVRMap) + for (auto &MO : llvm::make_early_inc_range(MRI.use_operands(I.first))) + if (MO.getParent()->getParent() == BB2) + MO.setReg(I.second); +} + +/// Performe loop distribution. +void Distribute::schedule() { + SUnit *Comp; + Register TripCount; + if (!findTripCountReg(&TripCount, &Comp)) + return; + + // Tentatively, schedule the instruction to minimize register pressure and + // divide it into two parts. Need some improvements: + // * Divide more flexibly based on DFG. + // * Preserve memory dependencies. + // * Ensure that cycles containing costly instructions do not cross the + // division. + // * Repeat division. 
+ + std::vector RPSched; + regPressListSched(RPSched); + + std::vector::iterator DistPoint = findDistPoint(RPSched); + if (DistPoint == RPSched.begin()) + return; + + MachineBasicBlock *OrgExitBB, *BB1, *BB2, *PreheaderBB2, *UniqExitBB; + constructBlocks(OrgExitBB, BB1, BB2, PreheaderBB2, UniqExitBB); + + std::map VRMap1, VRMap2; + cloneInstrs(RPSched, DistPoint, Comp, BB1, BB2, PreheaderBB2, VRMap1, VRMap2); + + std::map Reg2Offset; + gatherRegsStored(Reg2Offset, RPSched, DistPoint); + + Register ArraySize; + calcTmpArrayOffset(TripCount, Reg2Offset, ArraySize); + + Register TmpAreaTop; + allocStackArea(ArraySize, UniqExitBB, TmpAreaTop); + + insertTmpLoadStore(BB1, BB2, PreheaderBB2, Reg2Offset, TmpAreaTop, VRMap1); + + MF.erase(BB); +} + +/// List scheduling to minimize register pressure +void Distribute::regPressListSched(std::vector &RPSched) { + std::set Ready, Scheduled; + for (SUnit &SU : SUnits) { + if (SU.getInstr()->isPHI() || SU.getInstr()->isBranch()) { + Scheduled.insert(&SU); + continue; + } + bool IsReady = true; + for (SDep &Pred : SU.Preds) { + if (Scheduled.count(Pred.getSUnit()) == 0) { + IsReady = false; + break; + } + } + if (IsReady) + Ready.insert(&SU); + } + + while (Scheduled.size() < SUnits.size()) { + int MaxNumDead; + SUnit *Candidate = nullptr; + for (SUnit *SU : Ready) { + int NumDead = 0; + for (SDep &Pred : SU->Preds) { + bool Dead = true; + for (SDep &User : Pred.getSUnit()->Succs) { + if (User.getSUnit() != SU && Scheduled.count(User.getSUnit()) == 0) { + Dead = false; + break; + } + if (Dead) + NumDead++; + } + if (!Candidate || MaxNumDead < NumDead) { + MaxNumDead = NumDead; + Candidate = SU; + } + } + } + RPSched.push_back(Candidate); + Scheduled.insert(Candidate); + Ready.erase(Candidate); + for (SDep &Succ : Candidate->Succs) { + if (Succ.getSUnit()->isBoundaryNode() || Scheduled.count(Succ.getSUnit())) + continue; + bool IsReady = true; + for (SDep &Pred : Succ.getSUnit()->Preds) { + if (!Pred.getSUnit()->isBoundaryNode() && + Scheduled.count(Pred.getSUnit()) == 0) { + IsReady = false; + break; + } + } + if (IsReady) + Ready.insert(Succ.getSUnit()); + } + } +} + +int Distribute::numInvariant(std::vector::iterator B, + std::vector::iterator E) { + std::set Invariant; + for (auto I = B; I != E; ++I) { + for (MachineOperand &MO : (*I)->getInstr()->uses()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + if (MRI.getVRegDef(MO.getReg())->getParent() != BB) + Invariant.insert(MO.getReg()); + } + } + return Invariant.size(); +} + +int Distribute::numOutRegs(std::vector::iterator B, + std::vector::iterator E) { + int Res = 0; + for (auto I = B; I != E; ++I) { + for (MachineOperand &MO : (*I)->getInstr()->defs()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + if (MRI.getVRegDef(MO.getReg())->getParent() != BB) + Res++; + } + } + return Res; +} + +int Distribute::lenCritPath(std::vector::iterator B, + std::vector::iterator E) { + std::set Ready, Ranged(B, E); + std::map Deapth; + for (auto I = B; I != E; ++I) { + bool IsReady = true; + for (SDep &Pred : (*I)->Preds) { + if (Ranged.count(Pred.getSUnit())) { + IsReady = false; + break; + } + if (IsReady) + Ready.insert(*I); + } + } + + unsigned MaxDeapth = 0; + while (Ready.size()) { + SUnit *SU = *Ready.begin(); + Ready.erase(SU); + unsigned Max = 0; + for (SDep &Pred : SU->Preds) { + auto I = Deapth.find(Pred.getSUnit()); + if (I == Deapth.end()) + continue; + Max = std::max(Max, I->second + Pred.getLatency()); + } + Deapth[SU] = Max; + MaxDeapth = std::max(MaxDeapth, Max); + for 
(SDep &Succ : SU->Succs) { + if (Ranged.count(Succ.getSUnit()) == 0) + continue; + bool IsReady = true; + for (SDep &SPred : Succ.getSUnit()->Preds) { + if (Ranged.count(SPred.getSUnit()) && !Deapth.count(SPred.getSUnit())) { + IsReady = false; + break; + } + } + if (IsReady) + Ready.insert(Succ.getSUnit()); + } + } + return MaxDeapth; +} + +/// Calculate maximum number of registers consumed per iteration. +/// The number of registers consumed across iterations is not counted. +int Distribute::maxNumRegPerIter(std::vector::iterator B, + std::vector::iterator E) { + std::set Past, Ranged(B, E); + int MaxReg, CurReg; + MaxReg = CurReg = 0; + bool SpillAndDeadPrev = false; + for (auto I = B; I != E; ++I) { + Past.insert(*I); + if (SpillAndDeadPrev) { + CurReg--; + SpillAndDeadPrev = false; + } + if (isTargetRegKind((*I)->getInstr())) { + CurReg++; + bool HasNoUser = true; + for (SDep &D : (*I)->Succs) { + if (Ranged.count(D.getSUnit())) { + HasNoUser = false; + break; + } + } + if (HasNoUser) + // A case where it is only used in the other division. + SpillAndDeadPrev = true; + } + for (SDep &Pred : (*I)->Preds) { + if (!isTargetRegKind(Pred)) + continue; + if (Ranged.count(Pred.getSUnit()) == 0) { + bool IsLoaded = false; + for (SDep &PSucc : Pred.getSUnit()->Succs) { + if (Past.count(PSucc.getSUnit()) && PSucc.getSUnit() != *I) { + IsLoaded = true; + break; + } + } + if (!IsLoaded) + // A case where the definition in the other division is first + // referenced. Assume other references use the same loaded register. + CurReg++; + } + bool IsDead = true; + for (SDep &PSucc : Pred.getSUnit()->Succs) { + if (Past.count(PSucc.getSUnit()) == 0 && + Ranged.count(PSucc.getSUnit())) { + IsDead = false; + break; + } + } + if (IsDead) + CurReg--; + } + MaxReg = std::max(MaxReg, CurReg); + } + return MaxReg; +} + +static void consumeResources(SmallVectorImpl &ResourceCount, + int &NumMops, const MCSchedClassDesc &SCDesc, + const MCSubtargetInfo &STI, int Times = 1) { + NumMops += SCDesc.NumMicroOps * Times; + for (const MCWriteProcResEntry &PRE : + make_range(STI.getWriteProcResBegin(&SCDesc), + STI.getWriteProcResEnd(&SCDesc))) { + ResourceCount[PRE.ProcResourceIdx] += PRE.Cycles * Times; + } +} + +int Distribute::countNumTmpLoads(std::vector::iterator B, + std::vector::iterator E) { + std::set SetSUs(B, E); + std::set CrossRegs; + for (auto I = B; I != E; ++I) + for (SDep &SD : (*I)->Preds) + if (isTargetRegKind(SD) && !SetSUs.count(SD.getSUnit())) + CrossRegs.insert(SD.getReg()); + return CrossRegs.size(); +} + +int Distribute::countNumTmpStores(std::vector::iterator B, + std::vector::iterator E) { + std::set SetSUs(B, E); + std::set CrossRegs; + for (auto I = B; I != E; ++I) { + for (SDep &SD : (*I)->Succs) + if (isTargetRegKind(SD) && !SetSUs.count(SD.getSUnit())) + CrossRegs.insert(SD.getReg()); + } + return CrossRegs.size(); +} + +/// Calculate resource Min II. +int Distribute::resMII(std::vector::iterator B, + std::vector::iterator E) { + // Same method as MachinePipeliner. + // Instructions duplicated from the other division and address calculations + // for temporary arrays are assumed to be trivial and ignored. 
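+  // The loads and stores introduced for the temporary arrays themselves are
+  // accounted for separately below.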
+ + int NumMops = 0; + SmallVector ResourceCount(SM.getNumProcResourceKinds()); + for (auto I = B; I != E; ++I) { + SUnit &SU = **I; + if (TII->isZeroCost(SU.getInstr()->getOpcode())) + continue; + + const MCSchedClassDesc *SCDesc = getSchedClass(&SU); + if (!SCDesc->isValid()) + continue; + + consumeResources(ResourceCount, NumMops, *SCDesc, STI); + } + + // Consider load/store of temporary arrays. + consumeResources(ResourceCount, NumMops, + *STI.getSchedModel().getSchedClassDesc( + TII->get(AArch64::LDRDui).getSchedClass()), + STI, countNumTmpLoads(B, E)); + consumeResources(ResourceCount, NumMops, + *STI.getSchedModel().getSchedClassDesc( + TII->get(AArch64::STRDui).getSchedClass()), + STI, countNumTmpStores(B, E)); + + int Result = (NumMops + SM.IssueWidth - 1) / SM.IssueWidth; + for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc *Desc = SM.getProcResource(I); + int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits; + if (Cycles > Result) + Result = Cycles; + } + return Result; +} + +/// Estimate the smallest II satisfying register constraints in the division. +int Distribute::estII(std::vector::iterator B, + std::vector::iterator E, EstIIDetail &Detail) { + // Consider II as the total latency per iteration divided by the number of + // overlapping iterations. The number of iterations that can overlap is + // calculated as the number of registers in the architecture divided by the + // maximum register consumption per iteration. This calculation may be + // underestimated because the areas with maximum register pressure do not + // always overlap. + + // TODO: RecMII is not considered. + + if (B == E) + return 0; + Detail.NumRegPerIter = maxNumRegPerIter(B, E); + if (Detail.NumRegPerIter == 0) + return 0; + Detail.NumRegInvariant = numInvariant(B, E); + Detail.NumRegOutput = numOutRegs(B, E); + Detail.PossibleParallel = + (double)(NumRegs - Detail.NumRegInvariant - Detail.NumRegOutput) / + Detail.NumRegPerIter; + Detail.ResMII = resMII(B, E); + if (Detail.ResMII == 0) + return 0; + Detail.LenCritPath = lenCritPath(B, E); + Detail.RequiredParallel = (double)Detail.LenCritPath / Detail.ResMII; + Detail.NumTmpArray = countNumTmpLoads(B, E) + countNumTmpStores(B, E); + if (Detail.RequiredParallel <= 1) + // No pipelining required. + Detail.EstimatedII = Detail.ResMII; + else if (Detail.PossibleParallel > Detail.RequiredParallel) + // Can achieve ResMII. + Detail.EstimatedII = + std::ceil(Detail.LenCritPath / Detail.RequiredParallel); + else + // Need to pipeline loosely due to lack of registers. 
+    Detail.EstimatedII =
+        std::ceil(Detail.LenCritPath / Detail.PossibleParallel);
+  return Detail.EstimatedII;
+}
+
+bool AArch64LoopDistribute::runOnMachineFunction(MachineFunction &Mf) {
+  MF = &Mf;
+  MLI = &getAnalysis<MachineLoopInfo>();
+  TII = MF->getSubtarget().getInstrInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  for (const auto &L : *MLI)
+    distributeLoop(*L);
+
+  return false;
+}
+
+void AArch64LoopDistribute::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineLoopInfo>();
+  AU.addRequired<AAResultsWrapperPass>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+FunctionPass *llvm::createAArch64LoopDistribute() {
+  return new AArch64LoopDistribute();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -191,6 +191,9 @@
   bool enableMachineScheduler() const override { return true; }
   bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
+  bool enableMachinePipeliner() const override;
+  bool useDFAforSMS() const override { return false; }
+
   /// Returns ARM processor family.
   /// Avoid this function! CPU specifics should be kept local to this class
   /// and preferably modeled with SubtargetFeatures or properties in
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -68,6 +68,11 @@
     ForceStreamingCompatibleSVE("force-streaming-compatible-sve",
                                 cl::init(false), cl::Hidden);
 
+static cl::opt<bool>
+    EnableMachinePipeliner("aarch64-enable-pipeliner",
+                           cl::desc("Enable Machine Pipeliner for AArch64"),
+                           cl::init(false), cl::Hidden);
+
 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
   if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
     return OverrideVectorInsertExtractBaseCost;
@@ -479,3 +484,7 @@
   }
   return false;
 }
+
+bool AArch64Subtarget::enableMachinePipeliner() const {
+  return getSchedModel().hasInstrSchedModel() && EnableMachinePipeliner;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -172,6 +172,10 @@
                          cl::desc("Enable the AArch64 branch target pass"),
                          cl::init(true));
 
+static cl::opt<bool> EnableLoopDist("aarch64-enable-loop-dist",
+                                    cl::desc("Enable loop distribution"),
+                                    cl::init(false), cl::Hidden);
+
 static cl::opt<unsigned> SVEVectorBitsMaxOpt(
     "aarch64-sve-vector-bits-max",
     cl::desc("Assume SVE vector registers are at most this big, "
@@ -239,6 +243,7 @@
   initializeAArch64LowerHomogeneousPrologEpilogPass(*PR);
   initializeAArch64DAGToDAGISelPass(*PR);
   initializeAArch64GlobalsTaggingPass(*PR);
+  initializeAArch64LoopDistributePass(*PR);
 }
 
 //===----------------------------------------------------------------------===//
@@ -751,6 +756,11 @@
     // be register coalescer friendly.
addPass(&PeepholeOptimizerID); } + if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDist) { + addPass(createAArch64LoopDistribute()); + } + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(&MachinePipelinerID); } void AArch64PassConfig::addPostRegAlloc() { diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -65,6 +65,7 @@ AArch64InstrInfo.cpp AArch64KCFI.cpp AArch64LoadStoreOptimizer.cpp + AArch64LoopDistribute.cpp AArch64LowerHomogeneousPrologEpilog.cpp AArch64MachineFunctionInfo.cpp AArch64MachineScheduler.cpp