Index: llvm/include/llvm/CodeGen/ModuloSchedule.h =================================================================== --- llvm/include/llvm/CodeGen/ModuloSchedule.h +++ llvm/include/llvm/CodeGen/ModuloSchedule.h @@ -140,6 +140,11 @@ /// Return the rescheduled instructions in order. ArrayRef getInstructions() { return ScheduledInstrs; } + + void dump() { + print(dbgs()); + } + void print(raw_ostream &OS); }; /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, @@ -162,6 +167,7 @@ MachineBasicBlock *BB; MachineBasicBlock *Preheader; + MachineBasicBlock *NewKernel = nullptr; /// Map for each register and the max difference between its uses and def. /// The first element in the pair is the max difference in stages. The @@ -252,6 +258,38 @@ /// Performs the actual expansion. void expand(); + /// Performs final cleanup after expansion. + void cleanup(); + + /// Returns the newly rewritten kernel block, or nullptr if this was + /// optimized away. + MachineBasicBlock *getRewrittenKernel() { return NewKernel; } +}; + +/// A reimplementation of ModuloScheduleExpander. It works by generating a +/// standalone kernel loop and peeling out the prologs and epilogs. +/// +/// FIXME: This implementation cannot yet generate valid code. It can generate +/// a correct kernel but cannot peel out prologs and epilogs. +class PeelingModuloScheduleExpander { + ModuloSchedule &Schedule; + MachineFunction &MF; + const TargetSubtargetInfo &ST; + MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; + LiveIntervals *LIS; + + MachineBasicBlock *BB; + MachineBasicBlock *Preheader; +public: + PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, + LiveIntervals *LIS) + : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), + TII(ST.getInstrInfo()), LIS(LIS) {} + + /// Runs ModuloScheduleExpander and treats it as a golden input to validate + /// aspects of the code generated by PeelingModuloScheduleExpander. + void validateAgainstModuloScheduleExpander(); }; /// Expander that simply annotates each scheduled instruction with a post-instr Index: llvm/lib/CodeGen/MachinePipeliner.cpp =================================================================== --- llvm/lib/CodeGen/MachinePipeliner.cpp +++ llvm/lib/CodeGen/MachinePipeliner.cpp @@ -160,6 +160,11 @@ "with the generated schedule for feeding into the " "-modulo-schedule-test pass")); +static cl::opt ExperimentalCodeGen( + "pipeliner-experimental-cg", cl::Hidden, cl::init(false), + cl::desc( + "Use the experimental peeling code generator for software pipelining")); + namespace llvm { // A command line option to enable the CopyToPhi DAG mutation. @@ -549,8 +554,18 @@ MSTI.annotate(); return; } - ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges)); - MSE.expand(); + // The experimental code generator can't work if there are InstChanges. + if (ExperimentalCodeGen && NewInstrChanges.empty()) { + PeelingModuloScheduleExpander MSE(MF, MS, &LIS); + // Experimental code generation isn't complete yet, but it can partially + // validate the code it generates against the original + // ModuloScheduleExpander. + MSE.validateAgainstModuloScheduleExpander(); + } else { + ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges)); + MSE.expand(); + MSE.cleanup(); + } ++NumPipelined; } Index: llvm/lib/CodeGen/ModuloSchedule.cpp =================================================================== --- llvm/lib/CodeGen/ModuloSchedule.cpp +++ llvm/lib/CodeGen/ModuloSchedule.cpp @@ -7,17 +7,24 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ModuloSchedule.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "pipeliner" using namespace llvm; +void ModuloSchedule::print(raw_ostream &OS) { + for (MachineInstr *MI : ScheduledInstrs) + OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI; +} + //===----------------------------------------------------------------------===// // ModuloScheduleExpander implementation //===----------------------------------------------------------------------===// @@ -139,6 +146,7 @@ InstrMap[NewMI] = &*I; } + NewKernel = KernelBB; KernelBB->transferSuccessors(BB); KernelBB->replaceSuccessor(BB, KernelBB); @@ -163,13 +171,15 @@ // Add branches between prolog and epilog blocks. addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap); + delete[] VRMap; +} + +void ModuloScheduleExpander::cleanup() { // Remove the original loop since it's no longer referenced. for (auto &I : *BB) LIS.RemoveMachineInstrFromMaps(I); BB->clear(); BB->eraseFromParent(); - - delete[] VRMap; } /// Generate the pipeline prolog code. @@ -886,6 +896,8 @@ } LastPro->clear(); LastPro->eraseFromParent(); + if (LastPro == KernelBB) + NewKernel = nullptr; } else { numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc()); removePhis(Epilog, Prolog); @@ -1196,6 +1208,442 @@ return (LoopCycle > DefCycle) || (LoopStage <= DefStage); } +//===----------------------------------------------------------------------===// +// PeelingModuloScheduleExpander implementation +//===----------------------------------------------------------------------===// +// This is a reimplementation of ModuloScheduleExpander that works by creating +// a fully correct steady-state kernel and peeling off the prolog and epilogs. +//===----------------------------------------------------------------------===// + +namespace { +// Remove any dead phis in MBB. Dead phis either have only one block as input +// (in which case they are the identity) or have no uses. +void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI, + LiveIntervals *LIS) { + bool Changed = true; + while (Changed) { + Changed = false; + for (auto I = MBB->begin(); I != MBB->getFirstNonPHI();) { + MachineInstr &MI = *I++; + assert(MI.isPHI()); + if (MRI.use_empty(MI.getOperand(0).getReg())) { + if (LIS) + LIS->RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + Changed = true; + } else if (MI.getNumExplicitOperands() == 3) { + MRI.constrainRegClass(MI.getOperand(1).getReg(), + MRI.getRegClass(MI.getOperand(0).getReg())); + MRI.replaceRegWith(MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + if (LIS) + LIS->RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + Changed = true; + } + } + } +} + +/// Rewrites the kernel block in-place to adhere to the given schedule. +/// KernelRewriter holds all of the state required to perform the rewriting. +class KernelRewriter { + ModuloSchedule &S; + MachineBasicBlock *BB; + MachineBasicBlock *PreheaderBB, *ExitBB; + MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; + LiveIntervals *LIS; + + // Map from register class to canonical undef register for that class. + DenseMap Undefs; + // Map from to phi register for all created phis. Note that + // this map is only used when InitReg is non-undef. + DenseMap, Register> Phis; + // Map from LoopReg to phi register where the InitReg is undef. + DenseMap UndefPhis; + + // Reg is used by MI. Return the new register MI should use to adhere to the + // schedule. Insert phis as necessary. + Register remapUse(Register Reg, MachineInstr &MI); + // Insert a phi that carries LoopReg from the loop body and InitReg otherwise. + // If InitReg is not given it is chosen arbitrarily. It will either be undef + // or will be chosen so as to share another phi. + Register phi(Register LoopReg, Optional InitReg = {}, + const TargetRegisterClass *RC = nullptr); + // Create an undef register of the given register class. + Register undef(const TargetRegisterClass *RC); + +public: + KernelRewriter(MachineLoop &L, ModuloSchedule &S, + LiveIntervals *LIS = nullptr); + void rewrite(); +}; +} // namespace + +KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S, + LiveIntervals *LIS) + : S(S), BB(L.getTopBlock()), PreheaderBB(L.getLoopPreheader()), + ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()), + TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) { + PreheaderBB = *BB->pred_begin(); + if (PreheaderBB == BB) + PreheaderBB = *std::next(BB->pred_begin()); +} + +void KernelRewriter::rewrite() { + // Rearrange the loop to be in schedule order. Note that the schedule may + // contain instructions that are not owned by the loop block (InstrChanges and + // friends), so we gracefully handle unowned instructions and delete any + // instructions that weren't in the schedule. + auto InsertPt = BB->getFirstTerminator(); + MachineInstr *FirstMI = nullptr; + for (MachineInstr *MI : S.getInstructions()) { + if (MI->isPHI()) + continue; + if (MI->getParent()) + MI->removeFromParent(); + BB->insert(InsertPt, MI); + if (!FirstMI) + FirstMI = MI; + } + + // At this point all of the scheduled instructions are between FirstMI + // and the end of the block. Kill from the first non-phi to FirstMI. + for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) { + if (LIS) + LIS->RemoveMachineInstrFromMaps(*I); + (I++)->eraseFromParent(); + } + + // Now remap every instruction in the loop. + for (MachineInstr &MI : *BB) { + if (MI.isPHI()) + continue; + for (MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit()) + continue; + Register Reg = remapUse(MO.getReg(), MI); + MO.setReg(Reg); + } + } + EliminateDeadPhis(BB, MRI, LIS); + + // Ensure a phi exists for all instructions that are either referenced by + // an illegal phi or by an instruction outside the loop. This allows us to + // treat remaps of these values the same as "normal" values that come from + // loop-carried phis. + for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) { + if (MI->isPHI()) { + Register R = MI->getOperand(0).getReg(); + phi(R); + continue; + } + + for (MachineOperand &Def : MI->defs()) { + for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) { + if (MI.getParent() != BB) { + phi(Def.getReg()); + break; + } + } + } + } +} + +Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) { + MachineInstr *Producer = MRI.getUniqueVRegDef(Reg); + if (!Producer) + return Reg; + + int ConsumerStage = S.getStage(&MI); + if (!Producer->isPHI()) { + // Non-phi producers are simple to remap. Insert as many phis as the + // difference between the consumer and producer stages. + if (Producer->getParent() != BB) + // Producer was not inside the loop. Use the register as-is. + return Reg; + int ProducerStage = S.getStage(Producer); + assert(ConsumerStage != -1 && + "In-loop consumer should always be scheduled!"); + assert(ConsumerStage >= ProducerStage); + unsigned StageDiff = ConsumerStage - ProducerStage; + + for (unsigned I = 0; I < StageDiff; ++I) + Reg = phi(Reg); + return Reg; + } + + // First, dive through the phi chain to find the defaults for the generated + // phis. + SmallVector, 4> Defaults; + Register LoopReg = Reg; + auto LoopProducer = Producer; + while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) { + LoopReg = getLoopPhiReg(*LoopProducer, BB); + Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB)); + LoopProducer = MRI.getUniqueVRegDef(LoopReg); + assert(LoopProducer); + } + int LoopProducerStage = S.getStage(LoopProducer); + int LoopProducerCycle = S.getCycle(LoopProducer); + int ConsumerCycle = S.getCycle(&MI); + + int NumPhis; + if (LoopProducerStage == -1) { + NumPhis = Defaults.size(); + } else { + // Calculate the difference between producer and consumer stages in modulo + // arithmetic. Add NumStages to ConsumerStage so that it is always > + // LoopProducerStage, and take the mod NumStages. + int NumStages = S.getNumStages(); + int StageDiff = std::max(ConsumerStage - LoopProducerStage, 0); + NumPhis = Defaults.size() + StageDiff % NumStages; + } + + if (NumPhis > (int)Defaults.size()) + LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size() + << " to " << NumPhis << "\n"); + // If we need more phis than we have defaults for, pad out with undefs for the + // earliest phis, which are at the end of the defaults chain (the chain is in + // reverse order). + Defaults.resize(NumPhis, + Defaults.empty() ? Optional() : Defaults.back()); + + LLVM_DEBUG(dbgs() << "Inserting " << NumPhis << " phis for use of %" + << Reg.virtRegIndex() << " in " << MI); + auto DefaultI = Defaults.rbegin(); + if (ConsumerStage < LoopProducerStage && LoopProducerCycle < ConsumerCycle) { + // The consumer optionally consumes LoopProducer in the same iteration + // (because the producer is scheduled at an earlier cycle than the consumer) + // or the initial value. To facilitate this we create an illegal block here + // by embedding a phi in the middle of the block. We will fix this up + // immediately prior to pruning. + assert(!Defaults.empty()); + while (std::next(DefaultI) != Defaults.rend()) + LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg)); + + auto RC = MRI.getRegClass(Reg); + Register R = MRI.createVirtualRegister(RC); + BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R) + .addReg(DefaultI->getValue()) + .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect. + .addReg(LoopReg) + .addMBB(BB); // Block choice is arbitrary and has no effect. + return R; + } + + // Now we know the number of stages to jump back, insert the phi chain. + while (DefaultI != Defaults.rend()) + LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg)); + return LoopReg; +} + +Register KernelRewriter::phi(Register LoopReg, Optional InitReg, + const TargetRegisterClass *RC) { + // If the init register is not undef, try and find an existing phi. + if (InitReg.hasValue()) { + auto I = Phis.find({LoopReg, InitReg.getValue()}); + if (I != Phis.end()) + return I->second; + } else { + for (auto &KV : Phis) { + if (KV.first.first == LoopReg) + return KV.second; + } + } + + // InitReg is either undef or no existing phi takes InitReg as input. Try and + // find a phi that takes undef as input. + auto I = UndefPhis.find(LoopReg); + if (I != UndefPhis.end()) { + Register R = I->second; + if (!InitReg.hasValue()) + // Found a phi taking undef as input, and this input is undef so return + // without any more changes. + return R; + // Found a phi taking undef as input, so rewrite it to take InitReg. + MachineInstr *MI = MRI.getVRegDef(R); + MI->getOperand(1).setReg(InitReg.getValue()); + Phis.insert({{LoopReg, InitReg.getValue()}, R}); + MRI.constrainRegClass(R, MRI.getRegClass(InitReg.getValue())); + UndefPhis.erase(I); + return R; + } + + // Failed to find any existing phi to reuse, so create a new one. + if (!RC) + RC = MRI.getRegClass(LoopReg); + Register R = MRI.createVirtualRegister(RC); + if (InitReg.hasValue()) + MRI.constrainRegClass(R, MRI.getRegClass(*InitReg)); + BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R) + .addReg(InitReg.hasValue() ? *InitReg : undef(RC)) + .addMBB(PreheaderBB) + .addReg(LoopReg) + .addMBB(BB); + if (!InitReg.hasValue()) + UndefPhis[LoopReg] = R; + else + Phis[{LoopReg, *InitReg}] = R; + return R; +} + +Register KernelRewriter::undef(const TargetRegisterClass *RC) { + Register &R = Undefs[RC]; + if (R == 0) { + // Create an IMPLICIT_DEF that defines this register if we need it. + // All uses of this should be removed by the time we have finished unrolling + // prologs and epilogs. + R = MRI.createVirtualRegister(RC); + auto *InsertBB = &PreheaderBB->getParent()->front(); + BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), R); + } + return R; +} + +namespace { +/// Describes an operand in the kernel of a pipelined loop. Characteristics of +/// the operand are discovered, such as how many in-loop PHIs it has to jump +/// through and defaults for these phis. +class KernelOperandInfo { + MachineBasicBlock *BB; + MachineRegisterInfo &MRI; + SmallVector PhiDefaults; + MachineOperand *Source; + MachineOperand *Target; + +public: + KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI, + SmallPtrSetImpl &IllegalPhis) + : MRI(MRI) { + Source = MO; + BB = MO->getParent()->getParent(); + while (isRegInLoop(MO)) { + MachineInstr *MI = MRI.getVRegDef(MO->getReg()); + if (MI->isFullCopy()) { + MO = &MI->getOperand(1); + continue; + } + if (!MI->isPHI()) + break; + // If this is an illegal phi, don't count it in distance. + if (IllegalPhis.count(MI)) { + MO = &MI->getOperand(3); + continue; + } + + Register Default = getInitPhiReg(*MI, BB); + MO = MI->getOperand(2).getMBB() == BB ? &MI->getOperand(1) + : &MI->getOperand(3); + PhiDefaults.push_back(Default); + } + Target = MO; + } + + bool operator==(const KernelOperandInfo &Other) const { + return PhiDefaults.size() == Other.PhiDefaults.size(); + } + + void print(raw_ostream &OS) const { + OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in " + << *Source->getParent(); + } + +private: + bool isRegInLoop(MachineOperand *MO) { + return MO->isReg() && MO->getReg().isVirtual() && + MRI.getVRegDef(MO->getReg())->getParent() == BB; + } +}; +} // namespace + +void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() { + BB = Schedule.getLoop()->getTopBlock(); + Preheader = Schedule.getLoop()->getLoopPreheader(); + + // Dump the schedule before we invalidate and remap all its instructions. + // Stash it in a string so we can print it if we found an error. + std::string ScheduleDump; + raw_string_ostream OS(ScheduleDump); + Schedule.print(OS); + OS.flush(); + + // First, run the normal ModuleScheduleExpander. We don't support any + // InstrChanges. + assert(LIS && "Requires LiveIntervals!"); + ModuloScheduleExpander MSE(MF, Schedule, *LIS, + ModuloScheduleExpander::InstrChangesTy()); + MSE.expand(); + MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel(); + if (!ExpandedKernel) { + // The expander optimized away the kernel. We can't do any useful checking. + MSE.cleanup(); + return; + } + // Before running the KernelRewriter, re-add BB into the CFG. + Preheader->addSuccessor(BB); + + // Now run the new expansion algorithm. + KernelRewriter KR(*Schedule.getLoop(), Schedule); + KR.rewrite(); + + // Collect all illegal phis that the new algorithm created. We'll give these + // to KernelOperandInfo. + SmallPtrSet IllegalPhis; + for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) { + if (NI->isPHI()) + IllegalPhis.insert(&*NI); + } + + // Co-iterate across both kernels. We expect them to be identical apart from + // phis and full COPYs (we look through both). + SmallVector, 8> KOIs; + auto OI = ExpandedKernel->begin(); + auto NI = BB->begin(); + for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) { + while (OI->isPHI() || OI->isFullCopy()) + ++OI; + while (NI->isPHI() || NI->isFullCopy()) + ++NI; + assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!"); + // Analyze every operand separately. + for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin(); + OOpI != OI->operands_end(); ++OOpI, ++NOpI) + KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis), + KernelOperandInfo(&*NOpI, MRI, IllegalPhis)); + } + + bool Failed = false; + for (auto &OldAndNew : KOIs) { + if (OldAndNew.first == OldAndNew.second) + continue; + Failed = true; + errs() << "Modulo kernel validation error: [\n"; + errs() << " [golden] "; + OldAndNew.first.print(errs()); + errs() << " "; + OldAndNew.second.print(errs()); + errs() << "]\n"; + } + + if (Failed) { + errs() << "Golden reference kernel:\n"; + ExpandedKernel->dump(); + errs() << "New kernel:\n"; + BB->dump(); + errs() << ScheduleDump; + report_fatal_error( + "Modulo kernel validation (-pipeliner-experimental-cg) failed"); + } + + // Cleanup by removing BB from the CFG again as the original + // ModuloScheduleExpander intended. + Preheader->removeSuccessor(BB); + MSE.cleanup(); +} + //===----------------------------------------------------------------------===// // ModuloScheduleTestPass implementation //===----------------------------------------------------------------------===// @@ -1283,10 +1731,12 @@ } } - ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle), std::move(Stage)); + ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle), + std::move(Stage)); ModuloScheduleExpander MSE( MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy()); MSE.expand(); + MSE.cleanup(); } //===----------------------------------------------------------------------===//