Index: include/llvm/CodeGen/MachinePipeliner.h =================================================================== --- include/llvm/CodeGen/MachinePipeliner.h +++ include/llvm/CodeGen/MachinePipeliner.h @@ -40,11 +40,13 @@ #ifndef LLVM_LIB_CODEGEN_MACHINEPIPELINER_H #define LLVM_LIB_CODEGEN_MACHINEPIPELINER_H +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/TargetInstrInfo.h" - +#include namespace llvm { class NodeSet; @@ -61,6 +63,7 @@ const MachineDominatorTree *MDT = nullptr; const InstrItineraryData *InstrItins; const TargetInstrInfo *TII = nullptr; + const MachineBranchProbabilityInfo *MBPI = nullptr; RegisterClassInfo RegClassInfo; bool disabledByPragma = false; unsigned II_setByPragma = 0; @@ -93,6 +96,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -203,6 +207,8 @@ Mutations.push_back(llvm::make_unique()); } + MachinePipeliner &getPass() { return Pass; } + void schedule() override; void finishBlock() override; @@ -658,6 +664,146 @@ void dump() const; }; +//===----------------------------------------------------------------------===// +// CFG generation support +//===----------------------------------------------------------------------===// + +// A CFG wraps CFGBlocks and Stages. +class CFG; +// A CFGBlock represents one basic block in the generated CFG. It contains +// Stages. +class CFGBlock; + +// A Stage lives inside a CFGBlock and contains a clone of one stage of the +// schedule. +class Stage { +public: + // Create a Stage within the given CFGBlock. The stage will be fed the given + // input stages, which must be either zero, one or two in size. A placeholder + // nullptr can be given for the last stage if it is not yet defined. + Stage(CFGBlock *Block, std::initializer_list Inputs, + unsigned StageIdx); + + // When nullptr was given as the last input in the constructor, this method + // sets the last input. + void setLastInput(Stage *Input) { + assert(!Inputs.back()); + Inputs.back() = Input; + } + + // Perform remapping of the rewritten instructions. All inputs must be + // non-nullptr at this time. + void finalize(); + + // Remap all registers used outside the original loop. + void rewriteUsesOutsideOfLoop(); + + const DenseMap &remaps() { return PostRemaps; } + MachineBasicBlock *getMBB() { return MBB; } + CFG &getContext() { return Ctx; } + ArrayRef getInstrs() const { return Instrs; } + +private: + CFG &Ctx; + CFGBlock *Block; + // The block we will write our instructions into. + MachineBasicBlock *MBB; + // The index of this stage. + unsigned StageIdx; + // The inputs to this stage. The last input (if Inputs.size() == 2) may be + // nullptr until finalize() is called. + SmallVector Inputs; + // Our generated PHI instructions for collecting incoming values. + SmallVector Phis; + // Our generated instructions. + SmallVector Instrs; + // Map of original register to rewritten register on entry and including + // defs within this stage. + DenseMap Remaps; + // Remaps, but with loop-carried PHI values updated. + DenseMap PostRemaps; + // Map from generated PHI to its pre-rewrite register. + DenseMap PhiRegs; +}; + +// A CFGBlock manages the layout and rewriting of loop instructions. A +// CFGBlock creates possibly multiple copies of the non-terminator, +// non-phi instructions in OrigMBB. It handles PHI insertion and rewriting +// instructions to use previously-computed values. +class CFGBlock { +public: + CFGBlock(CFG &Ctx); + + // Creates a new Stage within this block. + Stage *CreateStage(std::initializer_list Inputs, unsigned StageIdx); + + // Rewrite the instructions in this block to account for feeds from + // predecessors. This must be the final action on this CFGBlock. Returns the + // new block. + MachineBasicBlock *finalize(); + + // Add a single successor to this block. This updates the predecessors list + // of Succ. + void addSuccessor(CFGBlock &Succ); + // Add multiple successors to this block. This updates the predecessors list + // of TrueSucc and FalseSucc. + void addSuccessors(CFGBlock &TrueSucc, CFGBlock &FalseSucc); + + // Set the branch condition (for a multiple-successor block) to Cond. + void setBranchCondition(SmallVector Cond) { + BranchCondition = std::move(Cond); + } + + // Reorder the instructions in this block to be according to the schedule + // order. + void sortAll(); + + CFG &getContext() { return Ctx; } + MachineBasicBlock *getMBB() { return MBB; } + ArrayRef successors() const { return Successors; } + ArrayRef predecessors() const { return Predecessors; } + +private: + CFG &Ctx; + + std::list Stages; + SmallVector Successors; + SmallVector Predecessors; + + // The generated block. + MachineBasicBlock *MBB; + SmallVector BranchCondition; +}; + +// Constructs and maintains state for a series of CFGBlocks. CFG +// drives the cloning of instructions and creation of prolog / epilog blocks. +// It does not insert the new region of blocks into the function. +class CFG { +public: + CFG(MachineLoop *Loop, SMSchedule &Schedule, SwingSchedulerDAG &DAG); + + MachineBasicBlock *getEntryBlock() { return Entry; } + MachineBasicBlock *getExitBlock() { return Exit; } + +private: + friend class Stage; + friend class CFGBlock; + + MachinePipeliner &Pass; + SMSchedule &Schedule; + MachineBasicBlock *OrigBB; + MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; + MachineFunction &MF; + SwingSchedulerDAG &DAG; + + // Map from cloned instruction to its original in OrigBB. + DenseMap Rewrites; + + MachineBasicBlock *Entry, *Exit; +}; + + } // end namespace llvm #endif // LLVM_LIB_CODEGEN_MACHINEPIPELINER_H Index: lib/CodeGen/MachinePipeliner.cpp =================================================================== --- lib/CodeGen/MachinePipeliner.cpp +++ lib/CodeGen/MachinePipeliner.cpp @@ -29,6 +29,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachinePipeliner.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" @@ -54,7 +55,6 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachinePipeliner.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -206,6 +206,7 @@ MLI = &getAnalysis(); MDT = &getAnalysis(); TII = MF->getSubtarget().getInstrInfo(); + MBPI = &getAnalysis(); RegClassInfo.runOnMachineFunction(*MF); for (auto &L : *MLI) @@ -515,7 +516,22 @@ return; } - generatePipelinedLoop(Schedule); + CFG CFG(&Loop, Schedule, *this); + MachineBasicBlock *OrigPreheader = Loop.getLoopPreheader(); + MachineBasicBlock *OrigLoopBlock = Loop.getTopBlock(); + MachineBasicBlock *OrigExitBlock = Loop.getExitBlock(); + OrigPreheader->removeSuccessor(Loop.getTopBlock()); + OrigPreheader->addSuccessor(CFG.getEntryBlock()); + + if (TII->removeBranch(*OrigPreheader) > 0) + TII->insertUnconditionalBranch(*OrigPreheader, CFG.getEntryBlock(), + DebugLoc()); + + CFG.getExitBlock()->addSuccessor(OrigExitBlock); + TII->insertUnconditionalBranch(*CFG.getExitBlock(), OrigExitBlock, + DebugLoc()); + OrigLoopBlock->removeSuccessor(OrigExitBlock); + OrigLoopBlock->eraseFromParent(); ++NumPipelined; } @@ -1989,7 +2005,6 @@ if (SwpMaxStages > -1 && Schedule.getMaxStageCount() > (unsigned)SwpMaxStages) scheduleFound = false; - LLVM_DEBUG({ if (!scheduleFound) dbgs() << "\tCan't schedule\n"; @@ -2012,836 +2027,6 @@ return scheduleFound && Schedule.getMaxStageCount() > 0; } -/// Given a schedule for the loop, generate a new version of the loop, -/// and replace the old version. This function generates a prolog -/// that contains the initial iterations in the pipeline, and kernel -/// loop, and the epilogue that contains the code for the final -/// iterations. -void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) { - // Create a new basic block for the kernel and add it to the CFG. - MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); - - unsigned MaxStageCount = Schedule.getMaxStageCount(); - - // Remember the registers that are used in different stages. The index is - // the iteration, or stage, that the instruction is scheduled in. This is - // a map between register names in the original block and the names created - // in each stage of the pipelined loop. - ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2]; - InstrMapTy InstrMap; - - SmallVector PrologBBs; - - MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader(); - assert(PreheaderBB != nullptr && - "Need to add code to handle loops w/o preheader"); - // Generate the prolog instructions that set up the pipeline. - generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs); - MF.insert(BB->getIterator(), KernelBB); - - // Rearrange the instructions to generate the new, pipelined loop, - // and update register names as needed. - for (int Cycle = Schedule.getFirstCycle(), - LastCycle = Schedule.getFinalCycle(); - Cycle <= LastCycle; ++Cycle) { - std::deque &CycleInstrs = Schedule.getInstructions(Cycle); - // This inner loop schedules each instruction in the cycle. - for (SUnit *CI : CycleInstrs) { - if (CI->getInstr()->isPHI()) - continue; - unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr())); - MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum); - updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap); - KernelBB->push_back(NewMI); - InstrMap[NewMI] = CI->getInstr(); - } - } - - // Copy any terminator instructions to the new kernel, and update - // names as needed. - for (MachineBasicBlock::iterator I = BB->getFirstTerminator(), - E = BB->instr_end(); - I != E; ++I) { - MachineInstr *NewMI = MF.CloneMachineInstr(&*I); - updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap); - KernelBB->push_back(NewMI); - InstrMap[NewMI] = &*I; - } - - KernelBB->transferSuccessors(BB); - KernelBB->replaceSuccessor(BB, KernelBB); - - generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, - VRMap, InstrMap, MaxStageCount, MaxStageCount, false); - generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap, - InstrMap, MaxStageCount, MaxStageCount, false); - - LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump();); - - SmallVector EpilogBBs; - // Generate the epilog instructions to complete the pipeline. - generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs, - PrologBBs); - - // We need this step because the register allocation doesn't handle some - // situations well, so we insert copies to help out. - splitLifetimes(KernelBB, EpilogBBs, Schedule); - - // Remove dead instructions due to loop induction variables. - removeDeadInstructions(KernelBB, EpilogBBs); - - // Add branches between prolog and epilog blocks. - addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap); - - // Remove the original loop since it's no longer referenced. - for (auto &I : *BB) - LIS.RemoveMachineInstrFromMaps(I); - BB->clear(); - BB->eraseFromParent(); - - delete[] VRMap; -} - -/// Generate the pipeline prolog code. -void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage, - MachineBasicBlock *KernelBB, - ValueMapTy *VRMap, - MBBVectorTy &PrologBBs) { - MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader(); - assert(PreheaderBB != nullptr && - "Need to add code to handle loops w/o preheader"); - MachineBasicBlock *PredBB = PreheaderBB; - InstrMapTy InstrMap; - - // Generate a basic block for each stage, not including the last stage, - // which will be generated in the kernel. Each basic block may contain - // instructions from multiple stages/iterations. - for (unsigned i = 0; i < LastStage; ++i) { - // Create and insert the prolog basic block prior to the original loop - // basic block. The original loop is removed later. - MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); - PrologBBs.push_back(NewBB); - MF.insert(BB->getIterator(), NewBB); - NewBB->transferSuccessors(PredBB); - PredBB->addSuccessor(NewBB); - PredBB = NewBB; - - // Generate instructions for each appropriate stage. Process instructions - // in original program order. - for (int StageNum = i; StageNum >= 0; --StageNum) { - for (MachineBasicBlock::iterator BBI = BB->instr_begin(), - BBE = BB->getFirstTerminator(); - BBI != BBE; ++BBI) { - if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) { - if (BBI->isPHI()) - continue; - MachineInstr *NewMI = - cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule); - updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule, - VRMap); - NewBB->push_back(NewMI); - InstrMap[NewMI] = &*BBI; - } - } - } - rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap); - LLVM_DEBUG({ - dbgs() << "prolog:\n"; - NewBB->dump(); - }); - } - - PredBB->replaceSuccessor(BB, KernelBB); - - // Check if we need to remove the branch from the preheader to the original - // loop, and replace it with a branch to the new loop. - unsigned numBranches = TII->removeBranch(*PreheaderBB); - if (numBranches) { - SmallVector Cond; - TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc()); - } -} - -/// Generate the pipeline epilog code. The epilog code finishes the iterations -/// that were started in either the prolog or the kernel. We create a basic -/// block for each stage that needs to complete. -void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage, - MachineBasicBlock *KernelBB, - ValueMapTy *VRMap, - MBBVectorTy &EpilogBBs, - MBBVectorTy &PrologBBs) { - // We need to change the branch from the kernel to the first epilog block, so - // this call to analyze branch uses the kernel rather than the original BB. - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - SmallVector Cond; - bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond); - assert(!checkBranch && "generateEpilog must be able to analyze the branch"); - if (checkBranch) - return; - - MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin(); - if (*LoopExitI == KernelBB) - ++LoopExitI; - assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor"); - MachineBasicBlock *LoopExitBB = *LoopExitI; - - MachineBasicBlock *PredBB = KernelBB; - MachineBasicBlock *EpilogStart = LoopExitBB; - InstrMapTy InstrMap; - - // Generate a basic block for each stage, not including the last stage, - // which was generated for the kernel. Each basic block may contain - // instructions from multiple stages/iterations. - int EpilogStage = LastStage + 1; - for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) { - MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(); - EpilogBBs.push_back(NewBB); - MF.insert(BB->getIterator(), NewBB); - - PredBB->replaceSuccessor(LoopExitBB, NewBB); - NewBB->addSuccessor(LoopExitBB); - - if (EpilogStart == LoopExitBB) - EpilogStart = NewBB; - - // Add instructions to the epilog depending on the current block. - // Process instructions in original program order. - for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) { - for (auto &BBI : *BB) { - if (BBI.isPHI()) - continue; - MachineInstr *In = &BBI; - if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) { - // Instructions with memoperands in the epilog are updated with - // conservative values. - MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0); - updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap); - NewBB->push_back(NewMI); - InstrMap[NewMI] = In; - } - } - } - generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, - VRMap, InstrMap, LastStage, EpilogStage, i == 1); - generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap, - InstrMap, LastStage, EpilogStage, i == 1); - PredBB = NewBB; - - LLVM_DEBUG({ - dbgs() << "epilog:\n"; - NewBB->dump(); - }); - } - - // Fix any Phi nodes in the loop exit block. - for (MachineInstr &MI : *LoopExitBB) { - if (!MI.isPHI()) - break; - for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) { - MachineOperand &MO = MI.getOperand(i); - if (MO.getMBB() == BB) - MO.setMBB(PredBB); - } - } - - // Create a branch to the new epilog from the kernel. - // Remove the original branch and add a new branch to the epilog. - TII->removeBranch(*KernelBB); - TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc()); - // Add a branch to the loop exit. - if (EpilogBBs.size() > 0) { - MachineBasicBlock *LastEpilogBB = EpilogBBs.back(); - SmallVector Cond1; - TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc()); - } -} - -/// Replace all uses of FromReg that appear outside the specified -/// basic block with ToReg. -static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg, - MachineBasicBlock *MBB, - MachineRegisterInfo &MRI, - LiveIntervals &LIS) { - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg), - E = MRI.use_end(); - I != E;) { - MachineOperand &O = *I; - ++I; - if (O.getParent()->getParent() != MBB) - O.setReg(ToReg); - } - if (!LIS.hasInterval(ToReg)) - LIS.createEmptyInterval(ToReg); -} - -/// Return true if the register has a use that occurs outside the -/// specified loop. -static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB, - MachineRegisterInfo &MRI) { - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg), - E = MRI.use_end(); - I != E; ++I) - if (I->getParent()->getParent() != BB) - return true; - return false; -} - -/// Generate Phis for the specific block in the generated pipelined code. -/// This function looks at the Phis from the original code to guide the -/// creation of new Phis. -void SwingSchedulerDAG::generateExistingPhis( - MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2, - MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum, - bool IsLast) { - // Compute the stage number for the initial value of the Phi, which - // comes from the prolog. The prolog to use depends on to which kernel/ - // epilog that we're adding the Phi. - unsigned PrologStage = 0; - unsigned PrevStage = 0; - bool InKernel = (LastStageNum == CurStageNum); - if (InKernel) { - PrologStage = LastStageNum - 1; - PrevStage = CurStageNum; - } else { - PrologStage = LastStageNum - (CurStageNum - LastStageNum); - PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1; - } - - for (MachineBasicBlock::iterator BBI = BB->instr_begin(), - BBE = BB->getFirstNonPHI(); - BBI != BBE; ++BBI) { - unsigned Def = BBI->getOperand(0).getReg(); - - unsigned InitVal = 0; - unsigned LoopVal = 0; - getPhiRegs(*BBI, BB, InitVal, LoopVal); - - unsigned PhiOp1 = 0; - // The Phi value from the loop body typically is defined in the loop, but - // not always. So, we need to check if the value is defined in the loop. - unsigned PhiOp2 = LoopVal; - if (VRMap[LastStageNum].count(LoopVal)) - PhiOp2 = VRMap[LastStageNum][LoopVal]; - - int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI)); - int LoopValStage = - Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal))); - unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum); - if (NumStages == 0) { - // We don't need to generate a Phi anymore, but we need to rename any uses - // of the Phi value. - unsigned NewReg = VRMap[PrevStage][LoopVal]; - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI, - Def, InitVal, NewReg); - if (VRMap[CurStageNum].count(LoopVal)) - VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal]; - } - // Adjust the number of Phis needed depending on the number of prologs left, - // and the distance from where the Phi is first scheduled. The number of - // Phis cannot exceed the number of prolog stages. Each stage can - // potentially define two values. - unsigned MaxPhis = PrologStage + 2; - if (!InKernel && (int)PrologStage <= LoopValStage) - MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1); - unsigned NumPhis = std::min(NumStages, MaxPhis); - - unsigned NewReg = 0; - unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled; - // In the epilog, we may need to look back one stage to get the correct - // Phi name because the epilog and prolog blocks execute the same stage. - // The correct name is from the previous block only when the Phi has - // been completely scheduled prior to the epilog, and Phi value is not - // needed in multiple stages. - int StageDiff = 0; - if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 && - NumPhis == 1) - StageDiff = 1; - // Adjust the computations below when the phi and the loop definition - // are scheduled in different stages. - if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage) - StageDiff = StageScheduled - LoopValStage; - for (unsigned np = 0; np < NumPhis; ++np) { - // If the Phi hasn't been scheduled, then use the initial Phi operand - // value. Otherwise, use the scheduled version of the instruction. This - // is a little complicated when a Phi references another Phi. - if (np > PrologStage || StageScheduled >= (int)LastStageNum) - PhiOp1 = InitVal; - // Check if the Phi has already been scheduled in a prolog stage. - else if (PrologStage >= AccessStage + StageDiff + np && - VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0) - PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal]; - // Check if the Phi has already been scheduled, but the loop instruction - // is either another Phi, or doesn't occur in the loop. - else if (PrologStage >= AccessStage + StageDiff + np) { - // If the Phi references another Phi, we need to examine the other - // Phi to get the correct value. - PhiOp1 = LoopVal; - MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1); - int Indirects = 1; - while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) { - int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1)); - if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects) - PhiOp1 = getInitPhiReg(*InstOp1, BB); - else - PhiOp1 = getLoopPhiReg(*InstOp1, BB); - InstOp1 = MRI.getVRegDef(PhiOp1); - int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1)); - int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0); - if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np && - VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) { - PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1]; - break; - } - ++Indirects; - } - } else - PhiOp1 = InitVal; - // If this references a generated Phi in the kernel, get the Phi operand - // from the incoming block. - if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) - if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB) - PhiOp1 = getInitPhiReg(*InstOp1, KernelBB); - - MachineInstr *PhiInst = MRI.getVRegDef(LoopVal); - bool LoopDefIsPhi = PhiInst && PhiInst->isPHI(); - // In the epilog, a map lookup is needed to get the value from the kernel, - // or previous epilog block. How is does this depends on if the - // instruction is scheduled in the previous block. - if (!InKernel) { - int StageDiffAdj = 0; - if (LoopValStage != -1 && StageScheduled > LoopValStage) - StageDiffAdj = StageScheduled - LoopValStage; - // Use the loop value defined in the kernel, unless the kernel - // contains the last definition of the Phi. - if (np == 0 && PrevStage == LastStageNum && - (StageScheduled != 0 || LoopValStage != 0) && - VRMap[PrevStage - StageDiffAdj].count(LoopVal)) - PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal]; - // Use the value defined by the Phi. We add one because we switch - // from looking at the loop value to the Phi definition. - else if (np > 0 && PrevStage == LastStageNum && - VRMap[PrevStage - np + 1].count(Def)) - PhiOp2 = VRMap[PrevStage - np + 1][Def]; - // Use the loop value defined in the kernel. - else if (static_cast(LoopValStage) > PrologStage + 1 && - VRMap[PrevStage - StageDiffAdj - np].count(LoopVal)) - PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal]; - // Use the value defined by the Phi, unless we're generating the first - // epilog and the Phi refers to a Phi in a different stage. - else if (VRMap[PrevStage - np].count(Def) && - (!LoopDefIsPhi || (PrevStage != LastStageNum) || (LoopValStage == StageScheduled))) - PhiOp2 = VRMap[PrevStage - np][Def]; - } - - // Check if we can reuse an existing Phi. This occurs when a Phi - // references another Phi, and the other Phi is scheduled in an - // earlier stage. We can try to reuse an existing Phi up until the last - // stage of the current Phi. - if (LoopDefIsPhi) { - if (static_cast(PrologStage - np) >= StageScheduled) { - int LVNumStages = Schedule.getStagesForPhi(LoopVal); - int StageDiff = (StageScheduled - LoopValStage); - LVNumStages -= StageDiff; - // Make sure the loop value Phi has been processed already. - if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) { - NewReg = PhiOp2; - unsigned ReuseStage = CurStageNum; - if (Schedule.isLoopCarried(this, *PhiInst)) - ReuseStage -= LVNumStages; - // Check if the Phi to reuse has been generated yet. If not, then - // there is nothing to reuse. - if (VRMap[ReuseStage - np].count(LoopVal)) { - NewReg = VRMap[ReuseStage - np][LoopVal]; - - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, - &*BBI, Def, NewReg); - // Update the map with the new Phi name. - VRMap[CurStageNum - np][Def] = NewReg; - PhiOp2 = NewReg; - if (VRMap[LastStageNum - np - 1].count(LoopVal)) - PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal]; - - if (IsLast && np == NumPhis - 1) - replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); - continue; - } - } - } - if (InKernel && StageDiff > 0 && - VRMap[CurStageNum - StageDiff - np].count(LoopVal)) - PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal]; - } - - const TargetRegisterClass *RC = MRI.getRegClass(Def); - NewReg = MRI.createVirtualRegister(RC); - - MachineInstrBuilder NewPhi = - BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(), - TII->get(TargetOpcode::PHI), NewReg); - NewPhi.addReg(PhiOp1).addMBB(BB1); - NewPhi.addReg(PhiOp2).addMBB(BB2); - if (np == 0) - InstrMap[NewPhi] = &*BBI; - - // We define the Phis after creating the new pipelined code, so - // we need to rename the Phi values in scheduled instructions. - - unsigned PrevReg = 0; - if (InKernel && VRMap[PrevStage - np].count(LoopVal)) - PrevReg = VRMap[PrevStage - np][LoopVal]; - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI, - Def, NewReg, PrevReg); - // If the Phi has been scheduled, use the new name for rewriting. - if (VRMap[CurStageNum - np].count(Def)) { - unsigned R = VRMap[CurStageNum - np][Def]; - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI, - R, NewReg); - } - - // Check if we need to rename any uses that occurs after the loop. The - // register to replace depends on whether the Phi is scheduled in the - // epilog. - if (IsLast && np == NumPhis - 1) - replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); - - // In the kernel, a dependent Phi uses the value from this Phi. - if (InKernel) - PhiOp2 = NewReg; - - // Update the map with the new Phi name. - VRMap[CurStageNum - np][Def] = NewReg; - } - - while (NumPhis++ < NumStages) { - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis, - &*BBI, Def, NewReg, 0); - } - - // Check if we need to rename a Phi that has been eliminated due to - // scheduling. - if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal)) - replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS); - } -} - -/// Generate Phis for the specified block in the generated pipelined code. -/// These are new Phis needed because the definition is scheduled after the -/// use in the pipelined sequence. -void SwingSchedulerDAG::generatePhis( - MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2, - MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum, - bool IsLast) { - // Compute the stage number that contains the initial Phi value, and - // the Phi from the previous stage. - unsigned PrologStage = 0; - unsigned PrevStage = 0; - unsigned StageDiff = CurStageNum - LastStageNum; - bool InKernel = (StageDiff == 0); - if (InKernel) { - PrologStage = LastStageNum - 1; - PrevStage = CurStageNum; - } else { - PrologStage = LastStageNum - StageDiff; - PrevStage = LastStageNum + StageDiff - 1; - } - - for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(), - BBE = BB->instr_end(); - BBI != BBE; ++BBI) { - for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = BBI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; - - int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI)); - assert(StageScheduled != -1 && "Expecting scheduled instruction."); - unsigned Def = MO.getReg(); - unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum); - // An instruction scheduled in stage 0 and is used after the loop - // requires a phi in the epilog for the last definition from either - // the kernel or prolog. - if (!InKernel && NumPhis == 0 && StageScheduled == 0 && - hasUseAfterLoop(Def, BB, MRI)) - NumPhis = 1; - if (!InKernel && (unsigned)StageScheduled > PrologStage) - continue; - - unsigned PhiOp2 = VRMap[PrevStage][Def]; - if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2)) - if (InstOp2->isPHI() && InstOp2->getParent() == NewBB) - PhiOp2 = getLoopPhiReg(*InstOp2, BB2); - // The number of Phis can't exceed the number of prolog stages. The - // prolog stage number is zero based. - if (NumPhis > PrologStage + 1 - StageScheduled) - NumPhis = PrologStage + 1 - StageScheduled; - for (unsigned np = 0; np < NumPhis; ++np) { - unsigned PhiOp1 = VRMap[PrologStage][Def]; - if (np <= PrologStage) - PhiOp1 = VRMap[PrologStage - np][Def]; - if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) { - if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB) - PhiOp1 = getInitPhiReg(*InstOp1, KernelBB); - if (InstOp1->isPHI() && InstOp1->getParent() == NewBB) - PhiOp1 = getInitPhiReg(*InstOp1, NewBB); - } - if (!InKernel) - PhiOp2 = VRMap[PrevStage - np][Def]; - - const TargetRegisterClass *RC = MRI.getRegClass(Def); - unsigned NewReg = MRI.createVirtualRegister(RC); - - MachineInstrBuilder NewPhi = - BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(), - TII->get(TargetOpcode::PHI), NewReg); - NewPhi.addReg(PhiOp1).addMBB(BB1); - NewPhi.addReg(PhiOp2).addMBB(BB2); - if (np == 0) - InstrMap[NewPhi] = &*BBI; - - // Rewrite uses and update the map. The actions depend upon whether - // we generating code for the kernel or epilog blocks. - if (InKernel) { - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, - &*BBI, PhiOp1, NewReg); - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, - &*BBI, PhiOp2, NewReg); - - PhiOp2 = NewReg; - VRMap[PrevStage - np - 1][Def] = NewReg; - } else { - VRMap[CurStageNum - np][Def] = NewReg; - if (np == NumPhis - 1) - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, - &*BBI, Def, NewReg); - } - if (IsLast && np == NumPhis - 1) - replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); - } - } - } -} - -/// Remove instructions that generate values with no uses. -/// Typically, these are induction variable operations that generate values -/// used in the loop itself. A dead instruction has a definition with -/// no uses, or uses that occur in the original loop only. -void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs) { - // For each epilog block, check that the value defined by each instruction - // is used. If not, delete it. - for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(), - MBE = EpilogBBs.rend(); - MBB != MBE; ++MBB) - for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(), - ME = (*MBB)->instr_rend(); - MI != ME;) { - // From DeadMachineInstructionElem. Don't delete inline assembly. - if (MI->isInlineAsm()) { - ++MI; - continue; - } - bool SawStore = false; - // Check if it's safe to remove the instruction due to side effects. - // We can, and want to, remove Phis here. - if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) { - ++MI; - continue; - } - bool used = true; - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), - MOE = MI->operands_end(); - MOI != MOE; ++MOI) { - if (!MOI->isReg() || !MOI->isDef()) - continue; - unsigned reg = MOI->getReg(); - // Assume physical registers are used, unless they are marked dead. - if (TargetRegisterInfo::isPhysicalRegister(reg)) { - used = !MOI->isDead(); - if (used) - break; - continue; - } - unsigned realUses = 0; - for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg), - EI = MRI.use_end(); - UI != EI; ++UI) { - // Check if there are any uses that occur only in the original - // loop. If so, that's not a real use. - if (UI->getParent()->getParent() != BB) { - realUses++; - used = true; - break; - } - } - if (realUses > 0) - break; - used = false; - } - if (!used) { - LIS.RemoveMachineInstrFromMaps(*MI); - MI++->eraseFromParent(); - continue; - } - ++MI; - } - // In the kernel block, check if we can remove a Phi that generates a value - // used in an instruction removed in the epilog block. - for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(), - BBE = KernelBB->getFirstNonPHI(); - BBI != BBE;) { - MachineInstr *MI = &*BBI; - ++BBI; - unsigned reg = MI->getOperand(0).getReg(); - if (MRI.use_begin(reg) == MRI.use_end()) { - LIS.RemoveMachineInstrFromMaps(*MI); - MI->eraseFromParent(); - } - } -} - -/// For loop carried definitions, we split the lifetime of a virtual register -/// that has uses past the definition in the next iteration. A copy with a new -/// virtual register is inserted before the definition, which helps with -/// generating a better register assignment. -/// -/// v1 = phi(a, v2) v1 = phi(a, v2) -/// v2 = phi(b, v3) v2 = phi(b, v3) -/// v3 = .. v4 = copy v1 -/// .. = V1 v3 = .. -/// .. = v4 -void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs, - SMSchedule &Schedule) { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - for (auto &PHI : KernelBB->phis()) { - unsigned Def = PHI.getOperand(0).getReg(); - // Check for any Phi definition that used as an operand of another Phi - // in the same block. - for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def), - E = MRI.use_instr_end(); - I != E; ++I) { - if (I->isPHI() && I->getParent() == KernelBB) { - // Get the loop carried definition. - unsigned LCDef = getLoopPhiReg(PHI, KernelBB); - if (!LCDef) - continue; - MachineInstr *MI = MRI.getVRegDef(LCDef); - if (!MI || MI->getParent() != KernelBB || MI->isPHI()) - continue; - // Search through the rest of the block looking for uses of the Phi - // definition. If one occurs, then split the lifetime. - unsigned SplitReg = 0; - for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI), - KernelBB->instr_end())) - if (BBJ.readsRegister(Def)) { - // We split the lifetime when we find the first use. - if (SplitReg == 0) { - SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def)); - BuildMI(*KernelBB, MI, MI->getDebugLoc(), - TII->get(TargetOpcode::COPY), SplitReg) - .addReg(Def); - } - BBJ.substituteRegister(Def, SplitReg, 0, *TRI); - } - if (!SplitReg) - continue; - // Search through each of the epilog blocks for any uses to be renamed. - for (auto &Epilog : EpilogBBs) - for (auto &I : *Epilog) - if (I.readsRegister(Def)) - I.substituteRegister(Def, SplitReg, 0, *TRI); - break; - } - } - } -} - -/// Remove the incoming block from the Phis in a basic block. -static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) { - for (MachineInstr &MI : *BB) { - if (!MI.isPHI()) - break; - for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) - if (MI.getOperand(i + 1).getMBB() == Incoming) { - MI.RemoveOperand(i + 1); - MI.RemoveOperand(i); - break; - } - } -} - -/// Create branches from each prolog basic block to the appropriate epilog -/// block. These edges are needed if the loop ends before reaching the -/// kernel. -void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB, - MBBVectorTy &PrologBBs, - MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs, - SMSchedule &Schedule, ValueMapTy *VRMap) { - assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch"); - MachineInstr *IndVar = Pass.LI.LoopInductionVar; - MachineInstr *Cmp = Pass.LI.LoopCompare; - MachineBasicBlock *LastPro = KernelBB; - MachineBasicBlock *LastEpi = KernelBB; - - // Start from the blocks connected to the kernel and work "out" - // to the first prolog and the last epilog blocks. - SmallVector PrevInsts; - unsigned MaxIter = PrologBBs.size() - 1; - unsigned LC = UINT_MAX; - unsigned LCMin = UINT_MAX; - for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) { - // Add branches to the prolog that go to the corresponding - // epilog, and the fall-thru prolog/kernel block. - MachineBasicBlock *Prolog = PrologBBs[j]; - MachineBasicBlock *Epilog = EpilogBBs[i]; - // We've executed one iteration, so decrement the loop count and check for - // the loop end. - SmallVector Cond; - // Check if the LOOP0 has already been removed. If so, then there is no need - // to reduce the trip count. - if (LC != 0) - LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond, - PrevInsts, j, MaxIter); - - // Record the value of the first trip count, which is used to determine if - // branches and blocks can be removed for constant trip counts. - if (LCMin == UINT_MAX) - LCMin = LC; - - unsigned numAdded = 0; - if (TargetRegisterInfo::isVirtualRegister(LC)) { - Prolog->addSuccessor(Epilog); - numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc()); - } else if (j >= LCMin) { - Prolog->addSuccessor(Epilog); - Prolog->removeSuccessor(LastPro); - LastEpi->removeSuccessor(Epilog); - numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc()); - removePhis(Epilog, LastEpi); - // Remove the blocks that are no longer referenced. - if (LastPro != LastEpi) { - LastEpi->clear(); - LastEpi->eraseFromParent(); - } - LastPro->clear(); - LastPro->eraseFromParent(); - } else { - numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc()); - removePhis(Epilog, Prolog); - } - LastPro = Prolog; - LastEpi = Epilog; - for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(), - E = Prolog->instr_rend(); - I != E && numAdded > 0; ++I, --numAdded) - updateInstruction(&*I, false, j, 0, Schedule, VRMap); - } -} - /// Return true if we can compute the amount the instruction changes /// during each iteration. Set Delta to the amount of the change. bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) { @@ -2907,228 +2092,6 @@ NewMI.setMemRefs(MF, NewMMOs); } -/// Clone the instruction for the new pipelined loop and update the -/// memory operands, if needed. -MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI, - unsigned CurStageNum, - unsigned InstStageNum) { - MachineInstr *NewMI = MF.CloneMachineInstr(OldMI); - // Check for tied operands in inline asm instructions. This should be handled - // elsewhere, but I'm not sure of the best solution. - if (OldMI->isInlineAsm()) - for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) { - const auto &MO = OldMI->getOperand(i); - if (MO.isReg() && MO.isUse()) - break; - unsigned UseIdx; - if (OldMI->isRegTiedToUseOperand(i, &UseIdx)) - NewMI->tieOperands(i, UseIdx); - } - updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum); - return NewMI; -} - -/// Clone the instruction for the new pipelined loop. If needed, this -/// function updates the instruction using the values saved in the -/// InstrChanges structure. -MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI, - unsigned CurStageNum, - unsigned InstStageNum, - SMSchedule &Schedule) { - MachineInstr *NewMI = MF.CloneMachineInstr(OldMI); - DenseMap>::iterator It = - InstrChanges.find(getSUnit(OldMI)); - if (It != InstrChanges.end()) { - std::pair RegAndOffset = It->second; - unsigned BasePos, OffsetPos; - if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos)) - return nullptr; - int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm(); - MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first); - if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum) - NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum); - NewMI->getOperand(OffsetPos).setImm(NewOffset); - } - updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum); - return NewMI; -} - -/// Update the machine instruction with new virtual registers. This -/// function may change the defintions and/or uses. -void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef, - unsigned CurStageNum, - unsigned InstrStageNum, - SMSchedule &Schedule, - ValueMapTy *VRMap) { - for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = NewMI->getOperand(i); - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; - unsigned reg = MO.getReg(); - if (MO.isDef()) { - // Create a new virtual register for the definition. - const TargetRegisterClass *RC = MRI.getRegClass(reg); - unsigned NewReg = MRI.createVirtualRegister(RC); - MO.setReg(NewReg); - VRMap[CurStageNum][reg] = NewReg; - if (LastDef) - replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS); - } else if (MO.isUse()) { - MachineInstr *Def = MRI.getVRegDef(reg); - // Compute the stage that contains the last definition for instruction. - int DefStageNum = Schedule.stageScheduled(getSUnit(Def)); - unsigned StageNum = CurStageNum; - if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) { - // Compute the difference in stages between the defintion and the use. - unsigned StageDiff = (InstrStageNum - DefStageNum); - // Make an adjustment to get the last definition. - StageNum -= StageDiff; - } - if (VRMap[StageNum].count(reg)) - MO.setReg(VRMap[StageNum][reg]); - } - } -} - -/// Return the instruction in the loop that defines the register. -/// If the definition is a Phi, then follow the Phi operand to -/// the instruction in the loop. -MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) { - SmallPtrSet Visited; - MachineInstr *Def = MRI.getVRegDef(Reg); - while (Def->isPHI()) { - if (!Visited.insert(Def).second) - break; - for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) - if (Def->getOperand(i + 1).getMBB() == BB) { - Def = MRI.getVRegDef(Def->getOperand(i).getReg()); - break; - } - } - return Def; -} - -/// Return the new name for the value from the previous stage. -unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage, - unsigned LoopVal, unsigned LoopStage, - ValueMapTy *VRMap, - MachineBasicBlock *BB) { - unsigned PrevVal = 0; - if (StageNum > PhiStage) { - MachineInstr *LoopInst = MRI.getVRegDef(LoopVal); - if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal)) - // The name is defined in the previous stage. - PrevVal = VRMap[StageNum - 1][LoopVal]; - else if (VRMap[StageNum].count(LoopVal)) - // The previous name is defined in the current stage when the instruction - // order is swapped. - PrevVal = VRMap[StageNum][LoopVal]; - else if (!LoopInst->isPHI() || LoopInst->getParent() != BB) - // The loop value hasn't yet been scheduled. - PrevVal = LoopVal; - else if (StageNum == PhiStage + 1) - // The loop value is another phi, which has not been scheduled. - PrevVal = getInitPhiReg(*LoopInst, BB); - else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB) - // The loop value is another phi, which has been scheduled. - PrevVal = - getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB), - LoopStage, VRMap, BB); - } - return PrevVal; -} - -/// Rewrite the Phi values in the specified block to use the mappings -/// from the initial operand. Once the Phi is scheduled, we switch -/// to using the loop value instead of the Phi value, so those names -/// do not need to be rewritten. -void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB, - unsigned StageNum, - SMSchedule &Schedule, - ValueMapTy *VRMap, - InstrMapTy &InstrMap) { - for (auto &PHI : BB->phis()) { - unsigned InitVal = 0; - unsigned LoopVal = 0; - getPhiRegs(PHI, BB, InitVal, LoopVal); - unsigned PhiDef = PHI.getOperand(0).getReg(); - - unsigned PhiStage = - (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef))); - unsigned LoopStage = - (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal))); - unsigned NumPhis = Schedule.getStagesForPhi(PhiDef); - if (NumPhis > StageNum) - NumPhis = StageNum; - for (unsigned np = 0; np <= NumPhis; ++np) { - unsigned NewVal = - getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB); - if (!NewVal) - NewVal = InitVal; - rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI, - PhiDef, NewVal); - } - } -} - -/// Rewrite a previously scheduled instruction to use the register value -/// from the new instruction. Make sure the instruction occurs in the -/// basic block, and we don't change the uses in the new instruction. -void SwingSchedulerDAG::rewriteScheduledInstr( - MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap, - unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, - unsigned NewReg, unsigned PrevReg) { - bool InProlog = (CurStageNum < Schedule.getMaxStageCount()); - int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum; - // Rewrite uses that have been scheduled already to use the new - // Phi register. - for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg), - EI = MRI.use_end(); - UI != EI;) { - MachineOperand &UseOp = *UI; - MachineInstr *UseMI = UseOp.getParent(); - ++UI; - if (UseMI->getParent() != BB) - continue; - if (UseMI->isPHI()) { - if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg) - continue; - if (getLoopPhiReg(*UseMI, BB) != OldReg) - continue; - } - InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI); - assert(OrigInstr != InstrMap.end() && "Instruction not scheduled."); - SUnit *OrigMISU = getSUnit(OrigInstr->second); - int StageSched = Schedule.stageScheduled(OrigMISU); - int CycleSched = Schedule.cycleScheduled(OrigMISU); - unsigned ReplaceReg = 0; - // This is the stage for the scheduled instruction. - if (StagePhi == StageSched && Phi->isPHI()) { - int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi)); - if (PrevReg && InProlog) - ReplaceReg = PrevReg; - else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) && - (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI())) - ReplaceReg = PrevReg; - else - ReplaceReg = NewReg; - } - // The scheduled instruction occurs before the scheduled Phi, and the - // Phi is not loop carried. - if (!InProlog && StagePhi + 1 == StageSched && - !Schedule.isLoopCarried(this, *Phi)) - ReplaceReg = NewReg; - if (StagePhi > StageSched && Phi->isPHI()) - ReplaceReg = NewReg; - if (!InProlog && !Phi->isPHI() && StagePhi < StageSched) - ReplaceReg = NewReg; - if (ReplaceReg) { - MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg)); - UseOp.setReg(ReplaceReg); - } - } -} - /// Check if we can change the instruction to use an offset value from the /// previous iteration. If so, return true and set the base and offset values /// so that we can rewrite the load, if necessary. @@ -3190,6 +2153,24 @@ return true; } +/// Return the instruction in the loop that defines the register. +/// If the definition is a Phi, then follow the Phi operand to +/// the instruction in the loop. +MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) { + SmallPtrSet Visited; + MachineInstr *Def = MRI.getVRegDef(Reg); + while (Def->isPHI()) { + if (!Visited.insert(Def).second) + break; + for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) + if (Def->getOperand(i + 1).getMBB() == BB) { + Def = MRI.getVRegDef(Def->getOperand(i).getReg()); + break; + } + } + return Def; +} + /// Apply changes to the instruction if needed. The changes are need /// to improve the scheduling and depend up on the final schedule. void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI, @@ -3257,6 +2238,13 @@ if (!DI->mayStore() || !SI->mayLoad()) return false; + if (!SI->mayAlias(AAForDep, *DI, /*UseTBAA=*/true) || + !DI->mayAlias(AAForDep, *SI, /*UseTBAA=*/true)) + return false; + if (DI->hasOneMemOperand() && (*DI->memoperands_begin())->getPseudoValue()) + return false; + dbgs() << "ISLoopCarried: "; DI->print(dbgs(), true, true); dbgs() << " + "; + SI->print(dbgs(), true, true); dbgs() << "\n"; unsigned DeltaS, DeltaD; if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD)) return true; @@ -3295,7 +2283,7 @@ if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD) return true; - + dbgs() << "probably not\n"; return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD); } @@ -3322,7 +2310,6 @@ int termCycle = forward ? EndCycle + 1 : EndCycle - 1; for (int curCycle = StartCycle; curCycle != termCycle; forward ? ++curCycle : --curCycle) { - // Add the already scheduled instructions at the specified cycle to the // DFA. ProcItinResources.clearResources(); @@ -4078,3 +3065,344 @@ std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0); } +CFG::CFG(MachineLoop *Loop, SMSchedule &Schedule, SwingSchedulerDAG &DAG) : + Pass(DAG.getPass()), Schedule(Schedule), MRI(DAG.MF.getRegInfo()), + MF(DAG.MF), DAG(DAG) { + TII = MF.getSubtarget().getInstrInfo(); + OrigBB = Loop->getTopBlock(); + + MachineBasicBlock &Preheader = *Loop->getLoopPreheader(); + unsigned NumStages = Schedule.getMaxStageCount() + 1; + + // Analyze the original loop to extract the main loop condition. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector KernelLoopCond; + TII->analyzeBranch(*OrigBB, TBB, FBB, KernelLoopCond); + + // State for TII->reduceLoopCount. + MachineInstr *IndVar = Pass.LI.LoopInductionVar; + MachineInstr *Cmp = Pass.LI.LoopCompare; + SmallVector PrevInsts; + + // We create a ladder of prolog and epilog blocks. Each prolog block tests for + // termination and if so jumps to an epilog block; + // + // prolog1 + // i0[stage 0] + // done? goto epilog1 + // + // prolog2 + // i0[stage 1] + // i1[stage 0] + // done? goto epilog2 + // + // kernel + // i0..(n-2)[stage 2] + // i1..(n-1)[stage 1] + // i2..n [stage 0] + // done? goto epilog2 else goto kernel + // + // epilog2 + // i(n-1) [stage 2] + // goto epilog1 + // + // epilog1 + // in [stage 1] + // in [stage 2] + // goto loop exit + // + // Note that this approach allows us to deal with any trip count, but means + // we must complete loop iterations during the epilog in the *reverse* order + // they were started. If all iterations are purely independent this may not + // be an issue but this may generate incorrect code if we have a loop-carried + // dependence (which is why we are conservative with loop-carried + // dependences). + + std::deque PrologStages, EpilogStages; + std::vector Blocks; + std::vector Prologs, Epilogs; + Prologs.reserve(NumStages); + Epilogs.reserve(NumStages); + + LLVM_DEBUG(dbgs() << "Generate prolog\n"); + // Generate prolog blocks until we have NumStages - 1 iterations in flight. + // The NumStages'th iteration will be launched by the kernel. + for (unsigned PrologStage = 0; PrologStage < NumStages - 1; ++PrologStage) { + CFGBlock PB(*this); + for (unsigned I = 0; I <= PrologStage; ++I) { + LLVM_DEBUG(dbgs() << " Clone stage " << I << " for prolog block " + << Prologs.size() << "\n"); + if (I == 0 && PrologStages.empty()) { + PrologStages.push_back(PB.CreateStage({}, I)); + } else { + PrologStages.push_back(PB.CreateStage({PrologStages.front()}, I)); + if (I > 0) + PrologStages.pop_front(); + } + } + + SmallVector Cond; + // TODO(jmolloy): This API is really too overfit for Hexagon and the + // previous implementation's details. It should be refactored. + TII->reduceLoopCount(*PB.getMBB(), Preheader, IndVar, *Cmp, Cond, + PrevInsts, PrologStage, NumStages - 1); + PB.setBranchCondition(std::move(Cond)); + + Blocks.push_back(PB.finalize()); + Prologs.push_back(std::move(PB)); + } + LLVM_DEBUG(dbgs() << "Generate kernel\n"); + + // Create the kernel block. + CFGBlock KernelPB(*this); + + // We create the kernel by creating one instance of each stage. We create + // stage zero last; Stage zero takes stage one as input for loop-carried phis. + // Iteration a: [0 | 1 | 2 ] + // Iteration b: [ 0 | 1 | 2] + // ^ b[0] may depend on a[1] for loop-carried phis. If + // b[0] depended on a[2], the schedule would be + // invalid. + // + std::deque KernelStages(1, nullptr); + for (int I = 1; I < NumStages; ++I) + KernelStages.push_back( + KernelPB.CreateStage({PrologStages[I - 1], nullptr}, I)); + KernelStages[0] = KernelPB.CreateStage({KernelStages[1]}, 0); + for (int I = 1; I < NumStages; ++I) + KernelStages[I]->setLastInput(KernelStages[I - 1]); + + KernelPB.setBranchCondition(std::move(KernelLoopCond)); + // Now interleave all the instructions in the kernel in schedule order. + KernelPB.sortAll(); + LLVM_DEBUG(dbgs() << "Generate epilog\n"); + + // Seed the Epilog with the final kernel state. + EpilogStages = KernelStages; + + // The last stage is complete so can be discarded. + EpilogStages.pop_back(); + + assert(PrologStages.size() == EpilogStages.size()); + Stage *LastStage = nullptr; + for (int PeelStage = NumStages - 1; PeelStage > 0; --PeelStage) { + CFGBlock PB(*this); + // In this epilog block our input blocks define stages [0,PeelStage]. This + // block will peel off and complete the PeelStage'th iteration. + LLVM_DEBUG(dbgs() << " Clone stage " << PeelStage << " for epilog block " + << Epilogs.size() << "\n"); + Stage *S = + PB.CreateStage({EpilogStages.back(), PrologStages.back()}, PeelStage); + for (unsigned I = PeelStage + 1; I < NumStages; ++I) { + LLVM_DEBUG(dbgs() << " Clone stage " << I << " for epilog block " + << Epilogs.size() << "\n"); + S = PB.CreateStage({S}, I); + } + PrologStages.pop_back(); + EpilogStages.pop_back(); + LastStage = S; + + // Pass through all stages before PeelStage. We do this by creating a dummy + // stage to force PHI creation. + for (unsigned I = 0; I < PeelStage - 1; ++I) + EpilogStages[I] = PB.CreateStage({PrologStages[I], EpilogStages[I]}, ~1U); + + // Connect up the associated prolog block to both its normal successor and + // this epilog. + Prologs[PeelStage - 1].addSuccessors( + PB, PeelStage == NumStages - 1 ? KernelPB : Prologs[PeelStage]); + if (!Epilogs.empty()) { + // We're not the first epilog block, so the previous epilog block falls + // through to us. + Epilogs.back().addSuccessor(PB); + } else { + // We are the first epilog block, so the kernel falls through to us. + KernelPB.addSuccessors(KernelPB, PB); + Blocks.push_back(KernelPB.finalize()); + } + Blocks.push_back(PB.finalize()); + Epilogs.push_back(std::move(PB)); + } + + LastStage->rewriteUsesOutsideOfLoop(); + + LLVM_DEBUG(for (auto *MBB : Blocks) MBB->dump();); + Entry = Blocks.front(); + Exit = Blocks.back(); +} + +Stage::Stage(CFGBlock *Block, std::initializer_list Ins, + unsigned StageIdx) + : Ctx(Block->getContext()), Block(Block), MBB(Block->getMBB()), + Inputs(Ins.begin(), Ins.end()) { + auto &MRI = Ctx.MRI; + if (Inputs.size() > 0) { + Remaps = Inputs[0]->remaps(); + } else { + // No inputs; loop-carried PHIs take their init value. + for (MachineInstr &MI : Ctx.OrigBB->phis()) + Remaps[MI.getOperand(0).getReg()] = getInitPhiReg(MI, Ctx.OrigBB); + } + + // On stage zero our input should only be used to collect loop-carried PHI + // values. No other outputs from previous stages should be considered. + bool OnlyLoopPhis = StageIdx == 0; + + // Start by creating PHIs for all live inputs. + auto InsertPt = MBB->getFirstNonPHI(); + if (!Inputs.empty()) + for (auto &KV : Inputs[0]->remaps()) { + if (OnlyLoopPhis && !MRI.getUniqueVRegDef(KV.first)->isPHI()) + continue; + if (Inputs.size() == 1) { + Remaps.insert(KV); + continue; + } + unsigned OrigReg = KV.first; + unsigned DestReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg)); + Phis.push_back(BuildMI(*MBB, InsertPt, DebugLoc(), + Ctx.TII->get(TargetOpcode::PHI), DestReg)); + PhiRegs[Phis.back()] = OrigReg; + Remaps[OrigReg] = DestReg; + } + + SmallVector SortedInstrs; + for (auto &MI : *Ctx.OrigBB) { + if (Ctx.Schedule.isScheduledAtStage(Ctx.DAG.getSUnit(&MI), StageIdx) && + !MI.isPHI()) + SortedInstrs.push_back(&MI); + } + sort(SortedInstrs, [&](MachineInstr *A, MachineInstr *B) { + return Ctx.Schedule.cycleScheduled(Ctx.DAG.getSUnit(A)) < + Ctx.Schedule.cycleScheduled(Ctx.DAG.getSUnit(B)); + }); + + for (auto *MI : SortedInstrs) { + MachineInstr *NewMI = Ctx.MF.CloneMachineInstr(MI); + MBB->insert(MBB->end(), NewMI); + for (auto &MO : NewMI->defs()) { + unsigned OrigReg = MO.getReg(); + unsigned DestReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg)); + NewMI->substituteRegister(OrigReg, DestReg, MO.getSubReg(), + *MRI.getTargetRegisterInfo()); + Remaps[OrigReg] = DestReg; + } + Instrs.push_back(NewMI); + Ctx.Rewrites[NewMI] = MI; + + LLVM_DEBUG(dbgs() << " add (stage " << StageIdx << ") " << *NewMI); + } + + PostRemaps = Remaps; + for (auto &MI : *Ctx.OrigBB) { + if (MI.isPHI() && Remaps.count(getLoopPhiReg(MI, Ctx.OrigBB))) + PostRemaps[MI.getOperand(0).getReg()] = + Remaps[getLoopPhiReg(MI, Ctx.OrigBB)]; + } +} + +void Stage::finalize() { + if (Inputs.size() > 1) { + assert(Inputs.size() == 2); + const DenseMap &R0 = Inputs[0]->remaps(); + const DenseMap &R1 = Inputs[1]->remaps(); + for (MachineInstr *Phi : Phis) { + unsigned DestReg = Phi->getOperand(0).getReg(); + unsigned OrigReg = PhiRegs[Phi]; + + MachineInstrBuilder B(*Phi->getParent()->getParent(), Phi); + unsigned R0Reg = R0.lookup(OrigReg); + B.addReg(R0Reg).addMBB(Inputs[0]->getMBB()); + Ctx.MRI.constrainRegClass(DestReg, Ctx.MRI.getRegClass(R0Reg)); + + // Even though we may have been defined with two inputs it may turn out + // that we only have one predecessor. + if (Block->predecessors().size() > 1) { + unsigned R1Reg = R1.lookup(OrigReg); + B.addReg(R1Reg).addMBB(Inputs[1]->getMBB()); + Ctx.MRI.constrainRegClass(DestReg, Ctx.MRI.getRegClass(R1Reg)); + } + } + } + + for (MachineInstr *MI : Instrs) { + for (auto &MO : MI->uses()) { + if (!MO.isReg() || !Remaps.count(MO.getReg())) + continue; + Ctx.MRI.constrainRegClass(Remaps[MO.getReg()], + Ctx.MRI.getRegClass(MO.getReg())); + MO.ChangeToRegister(Remaps[MO.getReg()], /*isDef=*/false); + } + } +} + +void Stage::rewriteUsesOutsideOfLoop() { + for (auto &KV : remaps()) { + unsigned RewrittenReg = KV.second; + unsigned OrigReg = KV.first; + for (MachineOperand &MO : Ctx.MRI.use_operands(OrigReg)) { + if (MO.getParent()->getParent() != Ctx.OrigBB) + MO.ChangeToRegister(RewrittenReg, /*isDef=*/false); + } + } +} + +CFGBlock::CFGBlock(CFG &Ctx) : Ctx(Ctx) { + MBB = Ctx.MF.CreateMachineBasicBlock(Ctx.OrigBB->getBasicBlock()); + Ctx.MF.insert(Ctx.OrigBB->getIterator(), MBB); +} + +void CFGBlock::sortAll() { + std::vector Instrs; + DenseMap Rewrites; + for (auto &S : Stages) { + Instrs.insert(Instrs.end(), S.getInstrs().begin(), S.getInstrs().end()); + } + sort(Instrs, [&](MachineInstr *A, MachineInstr *B) { + return Ctx.Schedule.cycleScheduled(Ctx.DAG.getSUnit(Ctx.Rewrites[A])) < + Ctx.Schedule.cycleScheduled(Ctx.DAG.getSUnit(Ctx.Rewrites[B])); + }); + for (MachineInstr *MI : Instrs) { + MI->removeFromParent(); + MBB->insert(MBB->end(), MI); + } +} + +MachineBasicBlock *CFGBlock::finalize() { + for (auto &S : Stages) + S.finalize(); + return MBB; +} + +Stage *CFGBlock::CreateStage(std::initializer_list Inputs, unsigned StageIdx) { + Stages.emplace_back(this, Inputs, StageIdx); + return &Stages.back(); +} + +void CFGBlock::addSuccessor(CFGBlock &Succ) { + MBB->addSuccessor(Succ.MBB); + Succ.Predecessors.push_back(this); + assert(Ctx.TII->insertUnconditionalBranch(*MBB, Succ.MBB, DebugLoc()) != 0); +} + +void CFGBlock::addSuccessors(CFGBlock &TrueSucc, + CFGBlock &FalseSucc) { + FalseSucc.Predecessors.push_back(this); + if (BranchCondition.empty()) { + MBB->addSuccessor(FalseSucc.MBB); + Ctx.TII->insertUnconditionalBranch(*MBB, FalseSucc.MBB, DebugLoc()); + return; + } + TrueSucc.Predecessors.push_back(this); + + // We do this dance to transfer the successor probabilities from the loop + // block. This is valid for all prolog blocks and the kernel block. + auto TrueI = Ctx.OrigBB->succ_begin(); + auto FalseI = std::next(TrueI); + MBB->addSuccessor(TrueSucc.MBB, + Ctx.Pass.MBPI->getEdgeProbability(Ctx.OrigBB, TrueI)); + MBB->addSuccessor(FalseSucc.MBB, + Ctx.Pass.MBPI->getEdgeProbability(Ctx.OrigBB, FalseI)); + assert(Ctx.TII->insertBranch(*MBB, TrueSucc.MBB, FalseSucc.MBB, + BranchCondition, DebugLoc()) != 0); +}