Index: include/llvm/CodeGen/MachinePipeliner.h =================================================================== --- include/llvm/CodeGen/MachinePipeliner.h +++ include/llvm/CodeGen/MachinePipeliner.h @@ -298,6 +298,7 @@ void checkValidNodeOrder(const NodeSetType &Circuits) const; bool schedulePipeline(SMSchedule &Schedule); void generatePipelinedLoop(SMSchedule &Schedule); + void experimentalGeneratePipeline(SMSchedule &Schedule); void generateProlog(SMSchedule &Schedule, unsigned LastStage, MachineBasicBlock *KernelBB, ValueMapTy *VRMap, MBBVectorTy &PrologBBs); @@ -316,7 +317,8 @@ unsigned CurStageNum, bool IsLast); void removeDeadInstructions(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs); - void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, + void splitLifetimes(MachineBasicBlock *KernelBB, + ArrayRef EpilogBBs, SMSchedule &Schedule); void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, @@ -347,11 +349,16 @@ bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos, unsigned &OffsetPos, unsigned &NewBase, int64_t &NewOffset); + void verifyExperimentalCodegen(MachineBasicBlock *LegacyKernel, + MachineBasicBlock *NewKernel, + ArrayRef LegacyPrologs, + ArrayRef NewPrologs); void postprocessDAG(); /// Set the Minimum Initiation Interval for this schedule attempt. void setMII(unsigned ResMII, unsigned RecMII); /// Set the Maximum Initiation Interval for this schedule attempt. 
void setMAX_II(); + }; /// A NodeSet contains a set of SUnit DAG nodes with additional information Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -79,6 +79,7 @@ MachineInstrBundle.cpp MachineInstr.cpp MachineLICM.cpp + MachineLoopUtils.cpp MachineLoopInfo.cpp MachineModuleInfo.cpp MachineModuleInfoImpls.cpp Index: lib/CodeGen/MachineLoopUtils.h =================================================================== --- /dev/null +++ lib/CodeGen/MachineLoopUtils.h @@ -0,0 +1,41 @@ +//=- MachineLoopUtils.h - Helper functions for manipulating loops -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H +#define LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H + +namespace llvm { +class MachineBasicBlock; +class MachineRegisterInfo; +class TargetInstrInfo; + +enum LoopPeelDirection { + LPD_Front, ///< Peel the first iteration of the loop. + LPD_Back ///< Peel the last iteration of the loop. +}; + +/// Peels a single block loop. Loop must have two successors, one of which +/// must be itself. Similarly it must have two predecessors, one of which must +/// be itself. +/// +/// The loop block is copied and inserted into the CFG such that two copies of +/// the loop follow on from each other. The copy is inserted either before or +/// after the loop based on Direction. +/// +/// Phis are updated and an unconditional branch inserted at the end of the +/// clone so as to execute a single iteration. +/// +/// The trip count of Loop is not updated. 
+MachineBasicBlock *PeelSingleBlockLoop(LoopPeelDirection Direction, + MachineBasicBlock *Loop, + MachineRegisterInfo &MRI, + const TargetInstrInfo *TII); + +} // namespace llvm + +#endif // LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H Index: lib/CodeGen/MachineLoopUtils.cpp =================================================================== --- /dev/null +++ lib/CodeGen/MachineLoopUtils.cpp @@ -0,0 +1,143 @@ +//=- MachineLoopUtils.cpp - Functions for manipulating loops ----------------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MachineLoopUtils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +using namespace llvm; + +namespace { +// MI's parent and BB are clones of each other. Find the equivalent copy of MI +// in BB. 
+MachineInstr &findEquivalentInstruction(MachineInstr &MI, + MachineBasicBlock *BB) { + MachineBasicBlock *PB = MI.getParent(); + unsigned Offset = std::distance(PB->instr_begin(), MI.getIterator()); + return *std::next(BB->instr_begin(), Offset); +} + +void updateIncomingPHIs(MachineBasicBlock *Succ, MachineBasicBlock *OldPred, + MachineBasicBlock *NewPred) { + for (auto I = Succ->begin(); I != Succ->end() && I->isPHI(); ++I) + for (unsigned Idx = I->getNumExplicitDefs(); + Idx < I->getNumExplicitOperands(); ++Idx) + if (I->getOperand(Idx).isMBB() && + I->getOperand(Idx).getMBB() == OldPred) { + I->getOperand(Idx).setMBB(NewPred); + } +} + +} // namespace + +MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction, + MachineBasicBlock *Loop, + MachineRegisterInfo &MRI, + const TargetInstrInfo *TII) { + MachineFunction &MF = *Loop->getParent(); + MachineBasicBlock *Preheader = *Loop->pred_begin(); + if (Preheader == Loop) + Preheader = *std::next(Loop->pred_begin()); + MachineBasicBlock *Exit = *Loop->succ_begin(); + if (Exit == Loop) + Exit = *std::next(Loop->succ_begin()); + + MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(Loop->getBasicBlock()); + if (Direction == LPD_Front) + MF.insert(Loop->getIterator(), NewBB); + else + MF.insert(std::next(Loop->getIterator()), NewBB); + + DenseMap Remaps; + auto InsertPt = NewBB->end(); + for (MachineInstr &MI : *Loop) { + MachineInstr *NewMI = MF.CloneMachineInstr(&MI); + NewBB->insert(InsertPt, NewMI); + for (MachineOperand &MO : NewMI->defs()) { + Register OrigR = MO.getReg(); + if (OrigR.isPhysical()) + // Remapping physical registers makes no sense. + continue; + Register &R = Remaps[OrigR]; + R = MRI.createVirtualRegister(MRI.getRegClass(OrigR)); + MO.setReg(R); + + if (Direction == LPD_Back) { + // Replace all uses outside the original loop with the new register. + // FIXME: is the use_iterator stable enough to mutate register uses + // while iterating? 
+ SmallVector Uses; + for (auto &Use : MRI.use_operands(OrigR)) + if (Use.getParent()->getParent() != Loop) + Uses.push_back(&Use); + for (auto *Use : Uses) { + MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg())); + Use->setReg(R); + } + } + } + } + + for (auto I = NewBB->getFirstNonPHI(); I != NewBB->end(); ++I) + for (MachineOperand &MO : I->uses()) + if (MO.isReg() && Remaps.count(MO.getReg())) + MO.setReg(Remaps[MO.getReg()]); + + for (auto I = NewBB->begin(); I->isPHI(); ++I) { + MachineInstr &MI = *I; + unsigned LoopRegIdx = 3, InitRegIdx = 1; + if (MI.getOperand(2).getMBB() != Preheader) + std::swap(LoopRegIdx, InitRegIdx); + MachineInstr &OrigPhi = findEquivalentInstruction(MI, Loop); + assert(OrigPhi.isPHI()); + if (Direction == LPD_Front) { + // When peeling front, we are only left with the initial value from the + // preheader. + Register R = MI.getOperand(LoopRegIdx).getReg(); + if (Remaps.count(R)) + R = Remaps[R]; + OrigPhi.getOperand(InitRegIdx).setReg(R); + MI.RemoveOperand(LoopRegIdx + 1); + MI.RemoveOperand(LoopRegIdx + 0); + } else { + // When peeling back, the initial value is the loop-carried value from + // the original loop. 
+ Register LoopReg = OrigPhi.getOperand(LoopRegIdx).getReg(); + MI.getOperand(LoopRegIdx).setReg(LoopReg); + MI.RemoveOperand(InitRegIdx + 1); + MI.RemoveOperand(InitRegIdx + 0); + } + } + + DebugLoc DL; + if (Direction == LPD_Front) { + Preheader->replaceSuccessor(Loop, NewBB); + NewBB->addSuccessor(Loop); + updateIncomingPHIs(Loop, Preheader, NewBB); + if (TII->removeBranch(*Preheader) > 0) + TII->insertBranch(*Preheader, NewBB, nullptr, {}, DL); + TII->removeBranch(*NewBB); + TII->insertBranch(*NewBB, Loop, nullptr, {}, DL); + } else { + Loop->replaceSuccessor(Exit, NewBB); + updateIncomingPHIs(Exit, Loop, NewBB); + NewBB->addSuccessor(Exit); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + assert(!TII->analyzeBranch(*Loop, TBB, FBB, Cond) && + "Must be able to analyze the loop branch!"); + TII->removeBranch(*Loop); + TII->insertBranch(*Loop, TBB == Exit ? NewBB : TBB, + FBB == Exit ? NewBB : FBB, Cond, DL); + if (TII->removeBranch(*NewBB) > 0) + TII->insertBranch(*NewBB, Exit, nullptr, {}, DL); + } + + return NewBB; +} Index: lib/CodeGen/MachinePipeliner.cpp =================================================================== --- lib/CodeGen/MachinePipeliner.cpp +++ lib/CodeGen/MachinePipeliner.cpp @@ -29,6 +29,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachinePipeliner.h" +#include "MachineLoopUtils.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" @@ -54,7 +56,6 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachinePipeliner.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -160,9 +161,14 @@ SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden, cl::init(true), cl::ZeroOrMore, cl::desc("Enable CopyToPhi DAG Mutation")); 
- } // end namespace llvm +/// Enables the experimental code generator. +static cl::opt EnableExperimentalCodeGen( + "swp-experimental-cg", + cl::desc("Enable experimental CG code in MachinePipeliner."), cl::Hidden, + cl::init(false)); + unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5; char MachinePipeliner::ID = 0; #ifndef NDEBUG @@ -515,7 +521,11 @@ return; } - generatePipelinedLoop(Schedule); + bool CanUseExperimentalCodeGen = InstrChanges.empty(); + if (EnableExperimentalCodeGen && CanUseExperimentalCodeGen) + experimentalGeneratePipeline(Schedule); + else + generatePipelinedLoop(Schedule); ++NumPipelined; } @@ -2713,7 +2723,7 @@ /// .. = V1 v3 = .. /// .. = v4 void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs, + ArrayRef EpilogBBs, SMSchedule &Schedule) { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (auto &PHI : KernelBB->phis()) { @@ -4086,3 +4096,707 @@ std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0); } +//===----------------------------------------------------------------------===// +// Experimental code generation code +//===----------------------------------------------------------------------===// + +namespace { +// Remove any dead phis in MBB. Dead phis either have only one block as input +// (in which case they are the identity) or have no uses. 
+void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI, + LiveIntervals &LIS) { + bool Changed = true; + while (Changed) { + Changed = false; + for (auto I = MBB->begin(); I != MBB->getFirstNonPHI();) { + MachineInstr &MI = *I++; + assert(MI.isPHI()); + if (MRI.use_empty(MI.getOperand(0).getReg())) { + LIS.RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + Changed = true; + } else if (MI.getNumExplicitOperands() == 3) { + MRI.constrainRegClass(MI.getOperand(1).getReg(), + MRI.getRegClass(MI.getOperand(0).getReg())); + MRI.replaceRegWith(MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + LIS.RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + Changed = true; + } + } + } +} + +// A simple class that takes identical copies of a single basic block and allows +// querying of equivalent instructions at a later point when the blocks are no +// longer identical. +class EquivalenceRegistry { +public: + EquivalenceRegistry() {} + void insert(MachineBasicBlock *MBB) { + unsigned Idx = 0; + for (auto &MI : *MBB) { + Insts[{MBB, Idx}] = &MI; + Indexes[&MI] = Idx++; + } + } + + MachineInstr *lookup(MachineInstr *MI, MachineBasicBlock *MBB) { + unsigned Idx = Indexes[MI]; + return Insts[{MBB, Idx}]; + } + + MachineOperand *lookup(MachineOperand *MO, MachineBasicBlock *MBB) { + unsigned Idx = MO->getParent()->findRegisterDefOperandIdx(MO->getReg()); + return &lookup(MO->getParent(), MBB)->getOperand(Idx); + } + + bool contains(MachineInstr *MI) { return Indexes.count(MI); } + +private: + // Map from <block, position in block at insert() time> to instruction. + DenseMap<std::pair<MachineBasicBlock *, unsigned>, MachineInstr *> Insts; + // Map from instruction to its position in its block at insert() time. + DenseMap<MachineInstr *, unsigned> Indexes; +}; + +// A CGBlock wraps a MachineBasicBlock with a copy of the scheduled kernel. +// One prototypical CGBlock is created from the original unpipelined loop and +// the SMSchedule. This can then be duplicated to form the prolog and epilog +// code. +class CGBlock { +public: + // Create a CGBlock from the original loop in BB. BB is rewritten + // in-place to adhere to Schedule.
Where schedule finalization has updated + // instructions to use pre-inc values (NewMIs) the rewritten block uses these + // updates. + CGBlock(MachineBasicBlock *BB, SMSchedule &Schedule, SwingSchedulerDAG &DAG, + LiveIntervals &LIS, EquivalenceRegistry &ER); + + // Peel this loop in the direction given by LPD. + CGBlock peel(LoopPeelDirection LPD); + + // Mark all instructions that are not in the set LiveStages as Pruned. Pruning + // does not happen in this function. + void setLiveStages(ArrayRef<unsigned> LiveStages); + // Rewrite all uses of pruned-out instructions. + void rewritePrunedInstUses(MachineBasicBlock *LoopExit); + // Actually prune all instructions marked as Pruned by setLiveStages. + void prune(); + void addPredecessor(CGBlock &PredCGB); + void setSinglePredecessor(CGBlock &PredCGB); + void insertBranchToSuccessors(ArrayRef<MachineOperand> Cond); + + MachineBasicBlock *getBB() { return BB; } + +private: + struct InstrInfo { + InstrInfo() = default; + InstrInfo(int Stage, int Cycle, unsigned Index, bool Prune) : + Stage(Stage), Cycle(Cycle), Index(Index), Prune(Prune) {} + // Entries can be default-constructed through InstrInfos[...], so every + // field carries an initializer; -1 is the "not scheduled" value that + // setLiveStages and remapUse already test for. + // The stage this instruction is scheduled in. + int Stage = -1; + // The cycle this instruction is scheduled in. + int Cycle = -1; + // The linear order of this instruction. Cycle only provides a partial + // order - this provides a total order. + unsigned Index = 0; + // True if this instruction should be pruned out of this block. + bool Prune = false; + }; + + // Private constructor for use by peel(). + CGBlock(MachineBasicBlock *BB, const CGBlock &CGB, + DenseMap<MachineInstr *, InstrInfo> InstrInfos); + + // Reg is used by MI. Return the new register MI should use to adhere to the + // schedule. Insert phis as necessary. + Register remapUse(Register Reg, MachineInstr &MI); + // Insert a phi that carries LoopReg from the loop body and InitReg otherwise. + // If InitReg is not given it is chosen arbitrarily. It will either be undef + // or will be chosen so as to share another phi.
+ Register phi(Register LoopReg, Optional<Register> InitReg = {}, + const TargetRegisterClass *RC = nullptr); + // Create an undef register of the given register class. + Register undef(const TargetRegisterClass *RC); + + MachineBasicBlock *BB; + SMSchedule &Schedule; + SwingSchedulerDAG &DAG; + MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; + LiveIntervals &LIS; + EquivalenceRegistry &ER; + + MachineBasicBlock *PreheaderBB; + DenseMap<MachineInstr *, InstrInfo> InstrInfos; + // Map from register class to canonical undef register for that class. + DenseMap<const TargetRegisterClass *, Register> Undefs; + // Map from <LoopReg, InitReg> to phi register for all created phis. Note that + // this map is only used when InitReg is non-undef. + DenseMap<std::pair<unsigned, unsigned>, Register> Phis; + // Map from LoopReg to phi register where the InitReg is undef. + DenseMap<unsigned, Register> UndefPhis; +}; + +struct ReduceLoopCountState { + ReduceLoopCountState(MachineBasicBlock *Preheader, + MachinePipeliner::LoopInfo &LI, unsigned NumPeeled) + : Preheader(Preheader), LI(LI), NumPeeled(NumPeeled) {} + SmallVector<MachineOperand, 4> Cond; + + MachineBasicBlock *Preheader; + MachinePipeliner::LoopInfo &LI; + unsigned PrevLC = UINT_MAX; + unsigned FirstLC = UINT_MAX; + SmallVector<MachineInstr *, 4> PrevInsts; + unsigned NumPeeled; + unsigned I = 0; +}; + +Optional<unsigned> DecrementLoopCount(MachineBasicBlock *BB, + ReduceLoopCountState &S, + const TargetInstrInfo *TII) { + if (S.PrevLC == 0) + return S.FirstLC; + S.Cond.clear(); + unsigned LC = TII->reduceLoopCount(*BB, *S.Preheader, S.LI.LoopInductionVar, + *S.LI.LoopCompare, S.Cond, S.PrevInsts, + S.NumPeeled - 1 - S.I++, S.NumPeeled - 1); + if (S.FirstLC == UINT_MAX) + S.FirstLC = LC; + S.PrevLC = LC; + return Register::isVirtualRegister(LC) ?
Optional() : S.FirstLC; +} +} // namespace + +void SwingSchedulerDAG::experimentalGeneratePipeline(SMSchedule &Schedule) { + EquivalenceRegistry ER; + MachineBasicBlock *PreheaderBB = *BB->pred_begin(); + if (PreheaderBB == BB) + PreheaderBB = *std::next(BB->pred_begin()); + MachineBasicBlock *LoopExitBB = Loop.getExitBlock(); + + std::list CGBlocks; + SmallVector Prologs, Epilogs; + std::list::iterator Kernel; + unsigned NumStages = Schedule.getMaxStageCount() + 1; + unsigned NumPrologs = NumStages - 1; + + // Create the first CGBlock, which is a rescheduled loop BB. + CGBlocks.emplace_back(BB, Schedule, *this, LIS, ER); + Kernel = CGBlocks.begin(); + LLVM_DEBUG({ + dbgs() << "Initial rewritten kernel: " << *Kernel->getBB(); + }); + + // Create the prologs, peeling in reverse. + SmallVector LiveStages; + for (unsigned I = 0; I < NumPrologs; ++I) { + LiveStages.push_back(I); + auto It = CGBlocks.insert(Kernel, Kernel->peel(LPD_Front)); + It->setLiveStages(LiveStages); + Prologs.push_back(&*It); + } + + // The epilogs end up peeling off the last iterations. So Epilog 0 performs + // stage NumStages-1, Epilog 1 performs stage NumStages-2 THEN NumStages-1. + for (unsigned I = 0; I < NumPrologs; ++I) { + unsigned StartStage = NumStages - 1 - I; + CGBlocks.insert(std::next(Kernel), Kernel->peel(LPD_Back)); + for (unsigned Stage = StartStage + 1; Stage < NumStages; ++Stage) { + CGBlocks.insert(std::next(Kernel), Kernel->peel(LPD_Back)); + } + } + auto EBI = std::next(Kernel); + for (unsigned I = 0; I < NumPrologs; ++I) { + unsigned StartStage = NumStages - 1 - I; + Epilogs.push_back(&*EBI); + EBI++->setLiveStages({StartStage}); + for (unsigned Stage = StartStage + 1; Stage < NumStages; ++Stage) { + EBI++->setLiveStages({Stage}); + } + } + + // Stitch the epilogs to the prologs, working outwards from the kernel. 
+ auto EI = Epilogs.begin(); + auto PI = Prologs.rbegin(); + unsigned N = 0; + ReduceLoopCountState RLCS(PreheaderBB, Pass.LI, NumPrologs); + for (; EI != Epilogs.end(); ++EI, ++PI, ++N) { + CGBlock &PB = **PI; + CGBlock &EB = **EI; + TII->removeBranch(*PB.getBB()); + Optional TripCount = DecrementLoopCount(PB.getBB(), RLCS, TII); + if (!TripCount.hasValue()) { + // Unknown trip count; plant predecessors. + PB.getBB()->addSuccessor(EB.getBB()); + EB.addPredecessor(PB); + TII->insertBranch(*PB.getBB(), EB.getBB(), *PB.getBB()->succ_begin(), + RLCS.Cond, DebugLoc()); + } else if (*TripCount < Prologs.size() - N) { + // Trip count is known to be bounded by this prolog block, so skip all + // other prolog blocks and the kernel, go directly to the epilog. + assert(PB.getBB()->succ_size() == 1); + PB.getBB()->removeSuccessor(PB.getBB()->succ_begin()); + PB.getBB()->addSuccessor(EB.getBB()); + EB.setSinglePredecessor(PB); + TII->insertBranch(*PB.getBB(), EB.getBB(), nullptr, {}, DebugLoc()); + } else { + // Trip count is known and is known to be larger than this prolog block, + // so go directly to the next prolog. + TII->insertBranch(*PB.getBB(), *PB.getBB()->succ_begin(), nullptr, {}, + DebugLoc()); + } + } + + // Rewrite uses between them in reverse order so that we see uses before defs. + for (CGBlock &CGB : reverse(CGBlocks)) + CGB.rewritePrunedInstUses(LoopExitBB); + + // Now all uses are rewritten, remove any instructions from stages that are + // not live. + for (CGBlock &CGB : reverse(CGBlocks)) + CGB.prune(); + + LLVM_DEBUG({ + auto It = CGBlocks.begin(); + dbgs() << "Prologs:\n"; + for (unsigned I = 0; I < NumPrologs; ++I) + It++->getBB()->dump(); + dbgs() << "Kernel:\n"; + It++->getBB()->dump(); + dbgs() << "Epilogs:\n"; + while (It != CGBlocks.end()) + It++->getBB()->dump(); + }); + + // Do a final optimization pass on the kernel. 
+ SmallVector EpilogBBs; + for (auto I = std::next(Kernel); I != CGBlocks.end(); ++I) + EpilogBBs.push_back(I->getBB()); + splitLifetimes(Kernel->getBB(), EpilogBBs, Schedule); + removeDeadInstructions(Kernel->getBB(), EpilogBBs); + + for (CGBlock &CGB : CGBlocks) { + if (CGB.getBB()->pred_empty() || CGB.getBB()->succ_empty()) + CGB.getBB()->eraseFromParent(); + } + if (Kernel->getBB()->succ_size() == 1) + Kernel->getBB()->eraseFromParent(); + + // NewMIs have been added to BB, so don't delete them. + NewMIs.clear(); +} + +CGBlock::CGBlock(MachineBasicBlock *BB, const CGBlock &CGB, + DenseMap NewInstrInfos) + : BB(BB), Schedule(CGB.Schedule), DAG(CGB.DAG), MRI(DAG.MRI), TII(DAG.TII), + LIS(CGB.LIS), ER(CGB.ER), InstrInfos(std::move(NewInstrInfos)) { + ER.insert(BB); + for (auto &II : InstrInfos) + II.second.Prune = false; +} + +CGBlock::CGBlock(MachineBasicBlock *BB, SMSchedule &Schedule, + SwingSchedulerDAG &DAG, LiveIntervals &LIS, + EquivalenceRegistry &ER) + : BB(BB), Schedule(Schedule), DAG(DAG), MRI(DAG.MRI), TII(DAG.TII), + LIS(LIS), ER(ER) { + // Discover this block's preheader. + PreheaderBB = *BB->pred_begin(); + if (PreheaderBB == BB) + PreheaderBB = *std::next(BB->pred_begin()); + + // Rearrange the loop to be in schedule order. Note that the schedule may + // contain instructions that are not owned by the loop block (InstrChanges and + // friends), so we gracefully handle unowned instructions and delete any + // instructions that weren't in the schedule. 
+ auto InsertPt = BB->getFirstTerminator(); + unsigned Index = 0; + MachineInstr *FirstMI = nullptr; + + for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle(); + ++Cycle) + for (SUnit *SU : Schedule.getInstructions(Cycle)) { + auto *MI = SU->getInstr(); + InstrInfos[MI] = {Schedule.stageScheduled(SU), Cycle, Index++, false}; + if (MI->isPHI()) + continue; + if (MI->getParent()) + MI->removeFromParent(); + BB->insert(InsertPt, MI); + if (!FirstMI) + FirstMI = MI; + } + + // At this point all of the scheduled instructions are between FirstMI + // and the end of the block. Kill from the first non-phi to FirstMI. + for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) { + LIS.RemoveMachineInstrFromMaps(*I); + (I++)->eraseFromParent(); + } + + // Now remap every instruction in the loop. + for (MachineInstr &MI : *BB) { + if (MI.isPHI()) + continue; + for (MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit()) + continue; + Register Reg = remapUse(MO.getReg(), MI); + MO.setReg(Reg); + } + } + EliminateDeadPhis(BB, MRI, LIS); + + // Ensure a phi exists for all instructions that are either referenced by + // an illegal phi or by an instruction outside the loop. This allows us to + // treat remaps of these values the same as "normal" values that come from + // loop-carried phis. + for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) { + if (MI->isPHI()) { + Register R = MI->getOperand(0).getReg(); + phi(R); + continue; + } + + for (MachineOperand &Def : MI->defs()) { + for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) { + if (MI.getParent() != BB) { + phi(Def.getReg()); + break; + } + } + } + } + + ER.insert(BB); +} + +Register CGBlock::remapUse(Register Reg, MachineInstr &MI) { + MachineInstr *Producer = MRI.getUniqueVRegDef(Reg); + if (!Producer) + return Reg; + + int ConsumerStage = InstrInfos[&MI].Stage; + if (!Producer->isPHI()) { + // Non-phi producers are simple to remap. 
Insert as many phis as the + // difference between the consumer and producer stages. + if (InstrInfos.count(Producer) == 0) + // Producer was not inside the loop. Use the register as-is. + return Reg; + int ProducerStage = InstrInfos[Producer].Stage; + assert(ConsumerStage != -1 && + "In-loop consumer should always be scheduled!"); + assert(ConsumerStage >= ProducerStage); + unsigned StageDiff = ConsumerStage - ProducerStage; + + for (unsigned I = 0; I < StageDiff; ++I) + Reg = phi(Reg); + return Reg; + } + + // First, dive through the phi chain to find the defaults for the generated + // phis. + SmallVector, 4> Defaults; + Register LoopReg = Reg; + auto LoopProducer = Producer; + while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) { + LoopReg = getLoopPhiReg(*LoopProducer, BB); + Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB)); + LoopProducer = MRI.getUniqueVRegDef(LoopReg); + assert(LoopProducer); + } + int LoopProducerStage = + InstrInfos.count(LoopProducer) ? InstrInfos[LoopProducer].Stage : -1; + int LoopProducerCycle = + InstrInfos.count(LoopProducer) ? InstrInfos[LoopProducer].Cycle : -1; + int ConsumerCycle = InstrInfos[&MI].Cycle; + + int NumPhis; + if (LoopProducerStage == -1) { + NumPhis = Defaults.size(); + } else { + // Calculate the difference between producer and consumer stages in modulo + // arithmetic. Add NumStages to ConsumerStage so that it is always > + // LoopProducerStage, and take the mod NumStages. + int NumStages = Schedule.getMaxStageCount() + 1; + int StageDiff = (ConsumerStage + NumStages) - LoopProducerStage; + NumPhis = Defaults.size() + StageDiff % NumStages; + } + + if (NumPhis > (int)Defaults.size()) + LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size() + << " to " << NumPhis << "\n"); + // If we need more phis than we have defaults for, pad out with undefs for the + // earliest phis, which are at the end of the defaults chain (the chain is in + // reverse order). 
+ Defaults.resize(NumPhis, + Defaults.empty() ? Optional() : Defaults.back()); + + LLVM_DEBUG(dbgs() << "Inserting " << NumPhis << " phis for use of %" + << Reg.virtRegIndex() << " in " << MI); + auto DefaultI = Defaults.rbegin(); + if (ConsumerStage < LoopProducerStage && LoopProducerCycle < ConsumerCycle) { + // The consumer optionally consumes LoopProducer in the same iteration + // (because the producer is scheduled at an earlier cycle than the consumer) + // or the initial value. To facilitate this we create an illegal block here + // by embedding a phi in the middle of the block. We will fix this up + // immediately prior to pruning. + auto RC = MRI.getRegClass(Reg); + Register R = MRI.createVirtualRegister(RC); + BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R) + .addReg(DefaultI++->getValue()) + .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect. + .addReg(LoopReg) + .addMBB(BB); // Block choice is arbitrary and has no effect. + return R; + } + + // Now we know the number of stages to jump back, insert the phi chain. + while (DefaultI != Defaults.rend()) + LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg)); + return LoopReg; +} + +Register CGBlock::phi(Register LoopReg, Optional InitReg, + const TargetRegisterClass *RC) { + // If the init register is not undef, try and find an existing phi. + if (InitReg.hasValue()) { + auto I = Phis.find({LoopReg, InitReg.getValue()}); + if (I != Phis.end()) + return I->second; + } else { + for (auto &KV : Phis) { + if (KV.first.first == LoopReg) + return KV.second; + } + } + + // InitReg is either undef or no existing phi takes InitReg as input. Try and + // find a phi that takes undef as input. + auto I = UndefPhis.find(LoopReg); + if (I != UndefPhis.end()) { + Register R = I->second; + if (!InitReg.hasValue()) + // Found a phi taking undef as input, and this input is undef so return + // without any more changes. 
+ return R; + // Found a phi taking undef as input, so rewrite it to take InitReg. + MachineInstr *MI = MRI.getVRegDef(R); + MI->getOperand(1).setReg(InitReg.getValue()); + Phis.insert({{LoopReg, InitReg.getValue()}, R}); + MRI.constrainRegClass(R, MRI.getRegClass(InitReg.getValue())); + UndefPhis.erase(I); + return R; + } + + // Failed to find any existing phi to reuse, so create a new one. + if (!RC) + RC = MRI.getRegClass(LoopReg); + Register R = MRI.createVirtualRegister(RC); + if (InitReg.hasValue()) + MRI.constrainRegClass(R, MRI.getRegClass(*InitReg)); + BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R) + .addReg(InitReg.hasValue() ? *InitReg : undef(RC)) + .addMBB(PreheaderBB) + .addReg(LoopReg) + .addMBB(BB); + if (!InitReg.hasValue()) + UndefPhis[LoopReg] = R; + else + Phis[{LoopReg, *InitReg}] = R; + return R; +} + +Register CGBlock::undef(const TargetRegisterClass *RC) { + Register &R = Undefs[RC]; + if (R == 0) { + // Create an IMPLICIT_DEF that defines this register if we need it. + // All uses of this should be removed by the time we have finished unrolling + // prologs and epilogs. + R = MRI.createVirtualRegister(RC); + auto *InsertBB = &PreheaderBB->getParent()->front(); + BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), R); + } + return R; +} + +CGBlock CGBlock::peel(LoopPeelDirection LPD) { + MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII); + auto OI = BB->begin(); + auto NI = NewBB->begin(); + DenseMap NewInstrInfos; + for (; OI != BB->end(); ++OI, ++NI) { + if (InstrInfos.count(&*OI)) + NewInstrInfos.insert({&*NI, InstrInfos[&*OI]}); + } + return CGBlock(NewBB, *this, std::move(NewInstrInfos)); +} + +void CGBlock::setLiveStages(ArrayRef LiveStages) { + for (auto &KV : InstrInfos) { + if (KV.second.Stage != -1 && + find(LiveStages, KV.second.Stage) == LiveStages.end()) + KV.second.Prune = true; + } +} + +// Return a PHI that uses R in BB. 
+/// Returns the PHI among the leading PHIs of \p BB that has register \p R as
+/// one of its inputs. Callers must only ask for a register that some leading
+/// PHI consumes; not finding one is treated as unreachable.
+static MachineInstr *findPhiProviding(Register R, MachineBasicBlock *BB) {
+  for (MachineInstr &Phi : BB->phis()) {
+    if (Phi.findRegisterUseOperand(R) != nullptr)
+      return &Phi;
+  }
+  llvm_unreachable("findPhiProviding failed to find an existing phi!");
+}
+
+/// Rewrites uses of values whose defining instruction in this block is marked
+/// Prune in InstrInfos.
+///
+/// Two passes are made over the block:
+///  1. For every pruned non-PHI instruction, each use of one of its defs that
+///     lives outside this block is redirected to the result of a PHI.
+///  2. For every PHI located after the leading PHIs (the temporary "illegal"
+///     PHIs), the loop-carried input (operand 3) is redirected when the
+///     instruction defining it is pruned. The PHI itself is left in place.
+///
+/// \param LoopExit is not referenced by this function.
+void CGBlock::rewritePrunedInstUses(MachineBasicBlock *LoopExit) {
+  // Rewrite all uses of predicated-out instructions.
+  for (MachineInstr &MI : *BB) {
+    if (MI.isPHI() || InstrInfos.count(&MI) == 0)
+      continue;
+    if (!InstrInfos[&MI].Prune)
+      // Predicate is live.
+      continue;
+    // This instruction is predicated out. Any uses outside this block must be
+    // rewritten.
+    LLVM_DEBUG(dbgs() << " - predicated out: " << MI);
+    // Collect first and rewrite afterwards, so that the use list is not
+    // modified while it is being walked.
+    SmallVector<MachineOperand *, 4> ToRewrite;
+    for (MachineOperand &DefMO : MI.defs()) {
+      for (auto &MO : MRI.use_operands(DefMO.getReg())) {
+        if (MO.getParent()->getParent() != BB)
+          ToRewrite.push_back(&MO);
+      }
+    }
+    for (MachineOperand *UseMO : ToRewrite) {
+      MachineInstr *UseMI = UseMO->getParent();
+      if (!ER.contains(UseMI)) {
+        // The user is not known to ER. Look up the predecessor's version of
+        // the defining operand and use the PHI in this block that consumes it.
+        MachineOperand &DefMO = *MRI.def_begin(UseMO->getReg());
+        Register PredR = ER.lookup(&DefMO, *BB->pred_begin())->getReg();
+        auto Phi = findPhiProviding(PredR, BB);
+        UseMO->setReg(Phi->getOperand(0).getReg());
+        continue;
+      }
+      // Only PHIs can use values from this block by construction.
+      assert(UseMI->isPHI());
+      // Match with the equivalent PHI in this block.
+      MachineInstr *BMI = ER.lookup(UseMI, BB);
+      assert(BMI->isPHI());
+      UseMO->setReg(BMI->getOperand(0).getReg());
+    }
+  }
+
+  // Rewrite the illegal PHIs we may have made in remapUse. These PHIs were
+  // created to represent a loop-carried dependency that, due to scheduling,
+  // requires zero PHIs inserted (producer is scheduled before consumer, but
+  // consumer's stage is after producer's).
+  for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
+    if (!MI->isPHI())
+      continue;
+    // Each illegal phi has exactly one use by construction. If that use is
+    // pruned, we don't need to rewrite the phi.
+    MachineInstr *UseMI = &*MRI.use_instr_begin(MI->getOperand(0).getReg());
+    Register LoopCarriedReg = MI->getOperand(3).getReg();
+
+    if (InstrInfos[UseMI].Prune)
+      continue;
+    // This is an illegal PHI inside a block. If the loop-carried value comes
+    // from this block and is predicated out we must find the previous version
+    // of that loop-carried value.
+    MachineInstr *LoopCarriedInst = MRI.getUniqueVRegDef(LoopCarriedReg);
+    if (!LoopCarriedInst || LoopCarriedInst->isPHI() ||
+        !InstrInfos[LoopCarriedInst].Prune)
+      continue;
+
+    MachineInstr *PredMI = ER.lookup(&*MI, *BB->pred_begin());
+    if (!PredMI)
+      continue;
+    MachineInstr *Phi = findPhiProviding(PredMI->getOperand(0).getReg(), BB);
+    // Update the loop-carried value of the illegal PHI. Don't erase it yet
+    // until the next pass when all values have been rewritten.
+    MI->getOperand(3).setReg(Phi->getOperand(0).getReg());
+  }
+}
+
+/// Removes everything marked Prune from this block.
+///
+/// In order: (1) each PHI found after the leading PHIs is replaced by one of
+/// its two inputs and erased, (2) every non-PHI instruction whose InstrInfos
+/// entry has Prune set is removed from LIS and erased, (3) EliminateDeadPhis
+/// is run on the block.
+void CGBlock::prune() {
+  // As a final pass, fix up any temporary illegal phis we created. These phis
+  // were created in the middle of the block to switch between either a
+  // loop-carried value computed in this block if available, or a default value
+  // if the loop-carried value is pruned out.
+  for (auto I = BB->getFirstNonPHI(); I != BB->end();) {
+    // Advance before touching MI: it is erased at the end of this iteration.
+    MachineInstr *MI = &*I++;
+    if (!MI->isPHI())
+      continue;
+    Register PhiR = MI->getOperand(0).getReg();
+    // The second PHI input is the desired value.
+    Register R = MI->getOperand(3).getReg();
+    MachineInstr *RMI = MRI.getUniqueVRegDef(R);
+    // The condition is parenthesised so the message string attaches to the
+    // whole check rather than only to its right-hand side.
+    assert((RMI->isPHI() || InstrInfos.count(RMI)) &&
+           "We shouldn't have created a phi in this case!");
+    if (!RMI->isPHI() && InstrInfos[RMI].Prune)
+      // Desired register will be pruned out, so use the init register instead.
+      R = MI->getOperand(1).getReg();
+    MRI.replaceRegWith(PhiR, R);
+    MRI.setRegClass(R, MRI.getRegClass(PhiR));
+    MI->eraseFromParent();
+  }
+
+  for (auto &KV : InstrInfos) {
+    if (!KV.first->isPHI() && KV.second.Prune) {
+      LIS.RemoveMachineInstrFromMaps(*KV.first);
+      KV.first->eraseFromParent();
+    }
+  }
+  EliminateDeadPhis(BB, MRI, LIS);
+}
+
+/// Extends every leading PHI of this block with an incoming (value, block)
+/// pair for \p PredCGB's block.
+///
+/// Only the PHI operands are updated here; no CFG edge is added by this
+/// function.
+void CGBlock::addPredecessor(CGBlock &PredCGB) {
+  // PredBB is our current unique predecessor.
+  MachineBasicBlock *PredBB = *BB->pred_begin();
+  // OtherPredBB is the BB that we are adding as a predecessor.
+  MachineBasicBlock *OtherPredBB = PredCGB.getBB();
+
+  // For every PHI, we have only one value which must come from PredBB due to
+  // how the loop blocks were peeled. Find the equivalent value in OtherPredBB.
+  // This is easy because all peeled blocks are identical, so we can just use
+  // the offset into PredBB.
+  for (MachineInstr &MI : BB->phis()) {
+    Register PredReg = MI.getOperand(1).getReg();
+    // Values defined outside PredBB are passed through unchanged.
+    Register Reg = PredReg;
+    MachineInstr *PredDefMI = MRI.getUniqueVRegDef(PredReg);
+    if (PredDefMI->getParent() == PredBB) {
+      MachineInstr *OtherDefMI = ER.lookup(PredDefMI, OtherPredBB);
+      assert(OtherDefMI->getOpcode() == PredDefMI->getOpcode());
+
+      // findRegisterDefOperandIdx returns -1 when there is no such def; keep
+      // the result signed so that case is caught instead of wrapping.
+      int MOOffset = PredDefMI->findRegisterDefOperandIdx(PredReg);
+      assert(MOOffset >= 0 && "Unique def must define the register!");
+      Reg = OtherDefMI->getOperand(MOOffset).getReg();
+    }
+    MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
+    MI.addOperand(MachineOperand::CreateMBB(OtherPredBB));
+  }
+}
+
+/// Retargets this block's leading PHIs from its current unique predecessor to
+/// \p PredCGB's block.
+///
+/// The edge from the current predecessor to this block is removed and the
+/// first incoming (value, block) pair of every leading PHI is rewritten in
+/// place. No edge from the new predecessor is added by this function.
+void CGBlock::setSinglePredecessor(CGBlock &PredCGB) {
+  // PredBB is our current unique predecessor.
+  MachineBasicBlock *PredBB = *BB->pred_begin();
+  // OtherPredBB is the BB that takes PredBB's place as the PHI source.
+  MachineBasicBlock *OtherPredBB = PredCGB.getBB();
+  PredBB->removeSuccessor(BB);
+
+  // For every PHI, we have only one value which must come from PredBB due to
+  // how the loop blocks were peeled. Find the equivalent value in OtherPredBB.
+  for (MachineInstr &MI : BB->phis()) {
+    Register PredReg = MI.getOperand(1).getReg();
+    // Values defined outside PredBB are passed through unchanged.
+    Register Reg = PredReg;
+    MachineInstr *PredDefMI = MRI.getUniqueVRegDef(PredReg);
+    if (PredDefMI->getParent() == PredBB) {
+      MachineInstr *OtherDefMI = ER.lookup(PredDefMI, OtherPredBB);
+      assert(OtherDefMI->getOpcode() == PredDefMI->getOpcode());
+
+      // Use the operand that actually defines PredReg, as addPredecessor
+      // does. Operand 0 is not necessarily that def when the instruction has
+      // more than one def.
+      int MOOffset = PredDefMI->findRegisterDefOperandIdx(PredReg);
+      assert(MOOffset >= 0 && "Unique def must define the register!");
+      Reg = OtherDefMI->getOperand(MOOffset).getReg();
+    }
+    MI.getOperand(1).setReg(Reg);
+    MI.getOperand(2).setMBB(OtherPredBB);
+  }
+}
Index: test/CodeGen/Hexagon/swp-epilog-phi11.ll
===================================================================
--- test/CodeGen/Hexagon/swp-epilog-phi11.ll
+++ test/CodeGen/Hexagon/swp-epilog-phi11.ll
@@ -1,13 +1,24 @@
-; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
-
+; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 -swp-experimental-cg=false < %s | FileCheck %s --check-prefix=LEGACY
+; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 -swp-experimental-cg=true < %s | FileCheck %s --check-prefix=NEW
 ; Test that the pipeliner correctly generates the operands in the
 ; epilog.
 
-; CHECK: loop0
-; CHECK: r{{[0-9]+}} = sfsub([[REG0:r([0-9]+)]],[[REG1:r([0-9]+)]])
-; CHECK: endloop0
-; CHECK: r{{[0-9]+}} = sfsub([[REG0]],[[REG1]])
-; CHECK: r{{[0-9]+}} = sfsub([[REG0]],r{{[0-9]+}})
+; LEGACY: loop0
+; LEGACY: r{{[0-9]+}} = sfsub([[REG0:r([0-9]+)]],[[REG1:r([0-9]+)]])
+; LEGACY: endloop0
+; LEGACY: r{{[0-9]+}} = sfsub([[REG0]],[[REG1]])
+; LEGACY: r{{[0-9]+}} = sfsub([[REG0]],r{{[0-9]+}})
+
+; New code generation uses 3 rather than 6 phis, which causes one fewer combine
+; but one extra move. Code is equivalent.
+; NEW: loop0 +; NEW: [[REG1:r([0-9]+)]] = {{r([0-9]+)}} +; NEW: [[REG2:r([0-9]+)]] = [[REG3:r([0-9]+)]] +; NEW: sfsub([[REG0:r([0-9]+)]],[[REG2]]) +; NEW: [[REG0]] = [[REG1]] +; NEW: endloop0 +; NEW: sfsub([[REG1]],[[REG3]]) + define dso_local void @test(i32 %m) local_unnamed_addr #0 { entry: Index: test/CodeGen/Hexagon/swp-epilog-phi6.ll =================================================================== --- test/CodeGen/Hexagon/swp-epilog-phi6.ll +++ test/CodeGen/Hexagon/swp-epilog-phi6.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=hexagon -O2 -debug-only=pipeliner -hexagon-initial-cfg-cleanup=0 < %s -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc -march=hexagon -O2 -debug-only=pipeliner -hexagon-initial-cfg-cleanup=0 -swp-experimental-cg=false < %s -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=LEGACY +; RUN: llc -march=hexagon -O2 -debug-only=pipeliner -hexagon-initial-cfg-cleanup=0 -swp-experimental-cg=true < %s -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=NEW ; REQUIRES: asserts ; Test that the phi in the first epilog block is getter the correct @@ -7,11 +8,17 @@ ; We need to use the kernel's phi value (if the Phi in the kernel is the ; last definition). -; CHECK: New block -; CHECK: %[[REG:([0-9]+)]]:intregs = PHI %{{.*}}, %[[REG1:([0-9]+)]] -; CHECK: %[[REG1]]:intregs = nuw A2_addi -; CHECK: epilog: -; CHECK: %{{[0-9]+}}:intregs = PHI %{{.*}}, %[[REG]] +; LEGACY: New block +; LEGACY: %[[REG:([0-9]+)]]:intregs = PHI %{{.*}}, %[[REG1:([0-9]+)]] +; LEGACY: %[[REG1]]:intregs = nuw A2_addi +; LEGACY: epilog: +; LEGACY: %{{[0-9]+}}:intregs = PHI %{{.*}}, %[[REG]] + +; NEW: Kernel: +; NEW: %[[REG:([0-9]+)]]:intregs = PHI %{{.*}}, %[[REG1:([0-9]+)]] +; NEW: %[[REG1]]:intregs = nuw A2_addi +; Note: The eventual PHI in this case is unused, so is dead-code eliminated by +; the new code generator. 
define void @f0(i32 %a0, i32 %a1) #0 { b0: Index: test/CodeGen/Hexagon/swp-epilog-phi7.ll =================================================================== --- test/CodeGen/Hexagon/swp-epilog-phi7.ll +++ test/CodeGen/Hexagon/swp-epilog-phi7.ll @@ -1,4 +1,8 @@ -; RUN: llc -march=hexagon -O2 -enable-pipeliner -disable-block-placement=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -O2 -enable-pipeliner -disable-block-placement=0 -swp-experimental-cg=false < %s | FileCheck %s + +; NOTE: This test is seemingly code-generated correctly with +; -swp-experimental-cg, but the output (in both cases) has been mangled by block +; placement which makes pattern matching nontrivial. ; For the Phis generated in the epilog, test that we generate the correct ; names for the values coming from the prolog stages. The test belows Index: test/CodeGen/Hexagon/swp-matmul-bitext.ll =================================================================== --- test/CodeGen/Hexagon/swp-matmul-bitext.ll +++ test/CodeGen/Hexagon/swp-matmul-bitext.ll @@ -1,7 +1,10 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner -swp-experimental-cg=false < %s | FileCheck %s ; From coremark. Test that we pipeline the matrix multiplication bitextract ; function. The pipelined code should have two packets. +; FIXME: Experimental codegen disabled for this test; we produce three packets. +; The generated loop at the end of pipelining looks identical except for +; memoperands not being updated. 
; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: Index: test/CodeGen/Hexagon/swp-xxh2.ll =================================================================== --- test/CodeGen/Hexagon/swp-xxh2.ll +++ test/CodeGen/Hexagon/swp-xxh2.ll @@ -1,15 +1,22 @@ -; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null -swp-experimental-cg=false | FileCheck %s --check-prefix=LEGACY +; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null -swp-experimental-cg=true | FileCheck %s --check-prefix=NEW ; REQUIRES: asserts ; Fix bug when pipelining xxh benchmark at O3, mv55, and with vectorization. ; The problem is choosing the correct name for the Phis in the epilog. -; CHECK: New block -; CHECK: %{{.*}}, %[[REG:([0-9]+)]]{{.*}} = L2_loadri_pi -; CHECK: epilog: -; CHECK: = PHI -; CHECK-NOT: = PHI %{{[0-9]+}}, {{.*}}, %[[REG]] -; CHECK: = PHI +; LEGACY: New block +; LEGACY: %{{.*}}, %[[REG:([0-9]+)]]{{.*}} = L2_loadri_pi +; LEGACY: epilog: +; LEGACY: = PHI +; LEGACY-NOT: = PHI %{{[0-9]+}}, {{.*}}, %[[REG]] +; LEGACY: = PHI + +; NEW: Kernel: +; NEW: %{{.*}}, %[[REG:([0-9]+)]]{{.*}} = L2_loadri_pi +; NEW: Epilogs: +; NEW: %[[REG]] +; NEW: = PHI ; Function Attrs: nounwind define void @f0(i32 %a0, i32* %a1) #0 { Index: test/CodeGen/PowerPC/sms-phi.ll =================================================================== --- test/CodeGen/PowerPC/sms-phi.ll +++ test/CodeGen/PowerPC/sms-phi.ll @@ -1,6 +1,7 @@ ; REQUIRES: asserts ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs\ ; RUN: -mcpu=pwr9 --ppc-enable-pipeliner -debug-only=pipeliner 2>&1 \ +; RUN: -swp-experimental-cg=false \ ; RUN: >/dev/null | FileCheck %s define dso_local void @sha512() #0 { ;CHECK: prolog: @@ -9,7 +10,8 @@ ;CHECK: %23:g8rc_and_g8rc_nox0 = PHI %5:g8rc_and_g8rc_nox0, %bb.3, %18:g8rc_and_g8rc_nox0, %bb.4 
;CHECK-NEXT: %24:g8rc = PHI %6:g8rc, %bb.3, %16:g8rc, %bb.4 ;CHECK-NEXT: %25:g8rc = PHI %6:g8rc, %bb.3, %19:g8rc, %bb.4 - br label %1 + +br label %1 1: ; preds = %1, %0 %2 = phi i64 [ 0, %0 ], [ %12, %1 ] Index: test/CodeGen/PowerPC/sms-simple.ll =================================================================== --- test/CodeGen/PowerPC/sms-simple.ll +++ test/CodeGen/PowerPC/sms-simple.ll @@ -19,11 +19,11 @@ ; CHECK-NEXT: mtctr r7 ; CHECK-NEXT: addi r5, r5, -8 ; CHECK-NEXT: lwzu r7, 12(r5) -; CHECK-NEXT: maddld r6, r7, r7, r6 -; CHECK-NEXT: lwz r7, 4(r5) +; CHECK-NEXT: maddld r[[x:[0-9]+]], r7, r7, r6 +; CHECK-NEXT: lwz r[[y:[0-9]+]], 4(r5) ; CHECK-NEXT: addi r4, r3, -8 -; CHECK-NEXT: stwu r6, 12(r4) -; CHECK-NEXT: maddld r6, r7, r7, r6 +; CHECK-DAG: stwu r[[x]], 12(r4) +; CHECK-DAG: maddld r6, r[[y]], r[[y]], r[[x]] ; CHECK-NEXT: lwz r7, 8(r5) ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %for.body