Index: llvm/trunk/include/llvm/CodeGen/MachineLoopUtils.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineLoopUtils.h
+++ llvm/trunk/include/llvm/CodeGen/MachineLoopUtils.h
@@ -0,0 +1,41 @@
+//=- MachineLoopUtils.h - Helper functions for manipulating loops -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H
+#define LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H
+
+namespace llvm {
+class MachineBasicBlock;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+
+enum LoopPeelDirection {
+  LPD_Front, ///< Peel the first iteration of the loop.
+  LPD_Back   ///< Peel the last iteration of the loop.
+};
+
+/// Peels a single block loop. Loop must have two successors, one of which
+/// must be itself. Similarly it must have two predecessors, one of which must
+/// be itself.
+///
+/// The loop block is copied and inserted into the CFG such that two copies of
+/// the loop follow on from each other. The copy is inserted either before or
+/// after the loop based on Direction.
+///
+/// Phis are updated and an unconditional branch inserted at the end of the
+/// clone so as to execute a single iteration.
+///
+/// The trip count of Loop is not updated.
+MachineBasicBlock *PeelSingleBlockLoop(LoopPeelDirection Direction,
+                                       MachineBasicBlock *Loop,
+                                       MachineRegisterInfo &MRI,
+                                       const TargetInstrInfo *TII);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H
Index: llvm/trunk/include/llvm/CodeGen/ModuloSchedule.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/ModuloSchedule.h
+++ llvm/trunk/include/llvm/CodeGen/ModuloSchedule.h
@@ -62,8 +62,10 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineLoopUtils.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include <deque>
 #include <vector>
 
 namespace llvm {
@@ -142,9 +144,7 @@
   /// Return the rescheduled instructions in order.
   ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
 
-  void dump() {
-    print(dbgs());
-  }
+  void dump() { print(dbgs()); }
 
   void print(raw_ostream &OS);
 };
@@ -270,9 +270,6 @@
 /// A reimplementation of ModuloScheduleExpander. It works by generating a
 /// standalone kernel loop and peeling out the prologs and epilogs.
-///
-/// FIXME: This implementation cannot yet generate valid code. It can generate
-/// a correct kernel but cannot peel out prologs and epilogs.
 class PeelingModuloScheduleExpander {
   ModuloSchedule &Schedule;
   MachineFunction &MF;
@@ -281,17 +278,70 @@
   const TargetInstrInfo *TII;
   LiveIntervals *LIS;
 
+  /// The original loop block that gets rewritten in-place.
   MachineBasicBlock *BB;
+  /// The original loop preheader.
   MachineBasicBlock *Preheader;
+  /// All prolog and epilog blocks.
+  SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
+  /// For every block, the stages that are produced.
+  DenseMap<MachineBasicBlock *, BitVector> LiveStages;
+  /// For every block, the stages that are available. A stage can be available
+  /// but not produced (in the epilog) or produced but not available (in the
+  /// prolog).
+  DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
+
+  /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
+  /// loop kernel clones.
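+  /// CanonicalMIs maps every instruction in a clone back to its canonical
+  /// instruction in the original kernel (BB); BlockMIs maps a (block,
+  /// canonical instruction) pair to the copy of that instruction in the block.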
+  DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
+  DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
+      BlockMIs;
+
+  /// State passed from peelKernel to peelPrologAndEpilogs().
+  std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
+
 public:
   PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
                                 LiveIntervals *LIS)
       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
         TII(ST.getInstrInfo()), LIS(LIS) {}
 
+  void expand();
+
   /// Runs ModuloScheduleExpander and treats it as a golden input to validate
   /// aspects of the code generated by PeelingModuloScheduleExpander.
   void validateAgainstModuloScheduleExpander();
+
+protected:
+  /// Converts BB from the original loop body to the rewritten, pipelined
+  /// steady-state.
+  void rewriteKernel();
+
+private:
+  /// Peels one iteration of the rewritten kernel (BB) in the specified
+  /// direction.
+  MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
+  /// Peel the kernel forwards and backwards to produce prologs and epilogs,
+  /// and stitch them together.
+  void peelPrologAndEpilogs();
+  /// All prolog and epilog blocks are clones of the kernel, so any produced
+  /// register in one block has a corollary in all other blocks.
+  Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
+  /// Change all users of MI, if MI is predicated out
+  /// (LiveStages[MI->getParent()] == false).
+  void rewriteUsesOf(MachineInstr *MI);
+  /// Insert branches between prologs, kernel and epilogs.
+  void fixupBranches();
+  /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
+  /// to a block dominated by all prologs and epilogs. This allows us to treat
+  /// the loop exiting block as any other kernel clone.
+  MachineBasicBlock *CreateLCSSAExitingBlock();
+  /// Helper to get the stage of an instruction in the schedule.
+  unsigned getStage(MachineInstr *MI) {
+    if (CanonicalMIs.count(MI))
+      MI = CanonicalMIs[MI];
+    return Schedule.getStage(MI);
+  }
 };
 
 /// Expander that simply annotates each scheduled instruction with a post-instr
Index: llvm/trunk/lib/CodeGen/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/CodeGen/CMakeLists.txt
+++ llvm/trunk/lib/CodeGen/CMakeLists.txt
@@ -80,6 +80,7 @@
   MachineInstr.cpp
   MachineLICM.cpp
   MachineLoopInfo.cpp
+  MachineLoopUtils.cpp
   MachineModuleInfo.cpp
   MachineModuleInfoImpls.cpp
   MachineOperand.cpp
Index: llvm/trunk/lib/CodeGen/MachineLoopUtils.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/MachineLoopUtils.cpp
+++ llvm/trunk/lib/CodeGen/MachineLoopUtils.cpp
@@ -0,0 +1,132 @@
+//=- MachineLoopUtils.cpp - Functions for manipulating loops ----------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+using namespace llvm;
+
+namespace {
+// MI's parent and BB are clones of each other. Find the equivalent copy of MI
+// in BB.
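+// Both blocks contain the same sequence of instructions, so the equivalent
+// copy is simply the instruction at the same offset from the start of BB.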
+MachineInstr &findEquivalentInstruction(MachineInstr &MI, + MachineBasicBlock *BB) { + MachineBasicBlock *PB = MI.getParent(); + unsigned Offset = std::distance(PB->instr_begin(), MachineBasicBlock::instr_iterator(MI)); + return *std::next(BB->instr_begin(), Offset); +} +} // namespace + +MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction, + MachineBasicBlock *Loop, + MachineRegisterInfo &MRI, + const TargetInstrInfo *TII) { + MachineFunction &MF = *Loop->getParent(); + MachineBasicBlock *Preheader = *Loop->pred_begin(); + if (Preheader == Loop) + Preheader = *std::next(Loop->pred_begin()); + MachineBasicBlock *Exit = *Loop->succ_begin(); + if (Exit == Loop) + Exit = *std::next(Loop->succ_begin()); + + MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(Loop->getBasicBlock()); + if (Direction == LPD_Front) + MF.insert(Loop->getIterator(), NewBB); + else + MF.insert(std::next(Loop->getIterator()), NewBB); + + // FIXME: Add DenseMapInfo trait for Register so we can use it as a key. + DenseMap Remaps; + auto InsertPt = NewBB->end(); + for (MachineInstr &MI : *Loop) { + MachineInstr *NewMI = MF.CloneMachineInstr(&MI); + NewBB->insert(InsertPt, NewMI); + for (MachineOperand &MO : NewMI->defs()) { + Register OrigR = MO.getReg(); + if (OrigR.isPhysical()) + continue; + Register &R = Remaps[OrigR]; + R = MRI.createVirtualRegister(MRI.getRegClass(OrigR)); + MO.setReg(R); + + if (Direction == LPD_Back) { + // Replace all uses outside the original loop with the new register. + // FIXME: is the use_iterator stable enough to mutate register uses + // while iterating? + SmallVector Uses; + for (auto &Use : MRI.use_operands(OrigR)) + if (Use.getParent()->getParent() != Loop) + Uses.push_back(&Use); + for (auto *Use : Uses) { + MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg())); + Use->setReg(R); + } + } + } + } + + for (auto I = NewBB->getFirstNonPHI(); I != NewBB->end(); ++I) + for (MachineOperand &MO : I->uses()) + if (MO.isReg() && Remaps.count(MO.getReg())) + MO.setReg(Remaps[MO.getReg()]); + + for (auto I = NewBB->begin(); I->isPHI(); ++I) { + MachineInstr &MI = *I; + unsigned LoopRegIdx = 3, InitRegIdx = 1; + if (MI.getOperand(2).getMBB() != Preheader) + std::swap(LoopRegIdx, InitRegIdx); + MachineInstr &OrigPhi = findEquivalentInstruction(MI, Loop); + assert(OrigPhi.isPHI()); + if (Direction == LPD_Front) { + // When peeling front, we are only left with the initial value from the + // preheader. + Register R = MI.getOperand(LoopRegIdx).getReg(); + if (Remaps.count(R)) + R = Remaps[R]; + OrigPhi.getOperand(InitRegIdx).setReg(R); + MI.RemoveOperand(LoopRegIdx + 1); + MI.RemoveOperand(LoopRegIdx + 0); + } else { + // When peeling back, the initial value is the loop-carried value from + // the original loop. 
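+      // Take the loop-carried operand of the original kernel PHI as the sole
+      // incoming value and drop this PHI's init operand.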
+ Register LoopReg = OrigPhi.getOperand(LoopRegIdx).getReg(); + MI.getOperand(LoopRegIdx).setReg(LoopReg); + MI.RemoveOperand(InitRegIdx + 1); + MI.RemoveOperand(InitRegIdx + 0); + } + } + + DebugLoc DL; + if (Direction == LPD_Front) { + Preheader->replaceSuccessor(Loop, NewBB); + NewBB->addSuccessor(Loop); + Loop->replacePhiUsesWith(Preheader, NewBB); + if (TII->removeBranch(*Preheader) > 0) + TII->insertBranch(*Preheader, NewBB, nullptr, {}, DL); + TII->removeBranch(*NewBB); + TII->insertBranch(*NewBB, Loop, nullptr, {}, DL); + } else { + Loop->replaceSuccessor(Exit, NewBB); + Exit->replacePhiUsesWith(Loop, NewBB); + NewBB->addSuccessor(Exit); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + bool CanAnalyzeBr = !TII->analyzeBranch(*Loop, TBB, FBB, Cond); + (void)CanAnalyzeBr; + assert(CanAnalyzeBr && "Must be able to analyze the loop branch!"); + TII->removeBranch(*Loop); + TII->insertBranch(*Loop, TBB == Exit ? NewBB : TBB, + FBB == Exit ? NewBB : FBB, Cond, DL); + if (TII->removeBranch(*NewBB) > 0) + TII->insertBranch(*NewBB, Exit, nullptr, {}, DL); + } + + return NewBB; +} Index: llvm/trunk/lib/CodeGen/MachinePipeliner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/MachinePipeliner.cpp +++ llvm/trunk/lib/CodeGen/MachinePipeliner.cpp @@ -557,10 +557,7 @@ // The experimental code generator can't work if there are InstChanges. if (ExperimentalCodeGen && NewInstrChanges.empty()) { PeelingModuloScheduleExpander MSE(MF, MS, &LIS); - // Experimental code generation isn't complete yet, but it can partially - // validate the code it generates against the original - // ModuloScheduleExpander. - MSE.validateAgainstModuloScheduleExpander(); + MSE.expand(); } else { ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges)); MSE.expand(); Index: llvm/trunk/lib/CodeGen/ModuloSchedule.cpp =================================================================== --- llvm/trunk/lib/CodeGen/ModuloSchedule.cpp +++ llvm/trunk/lib/CodeGen/ModuloSchedule.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopUtils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/MC/MCContext.h" @@ -1564,6 +1565,266 @@ }; } // namespace +MachineBasicBlock * +PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) { + MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII); + if (LPD == LPD_Front) + PeeledFront.push_back(NewBB); + else + PeeledBack.push_front(NewBB); + for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator(); + ++I, ++NI) { + CanonicalMIs[&*I] = &*I; + CanonicalMIs[&*NI] = &*I; + BlockMIs[{NewBB, &*I}] = &*NI; + BlockMIs[{BB, &*I}] = &*I; + } + return NewBB; +} + +void PeelingModuloScheduleExpander::peelPrologAndEpilogs() { + BitVector LS(Schedule.getNumStages(), true); + BitVector AS(Schedule.getNumStages(), true); + LiveStages[BB] = LS; + AvailableStages[BB] = AS; + + // Peel out the prologs. + LS.reset(); + for (int I = 0; I < Schedule.getNumStages() - 1; ++I) { + LS[I] = 1; + Prologs.push_back(peelKernel(LPD_Front)); + LiveStages[Prologs.back()] = LS; + AvailableStages[Prologs.back()] = LS; + } + + // Create a block that will end up as the new loop exiting block (dominated by + // all prologs and epilogs). It will only contain PHIs, in the same order as + // BB's PHIs. 
This gives us a poor-man's LCSSA with the inductive property + // that the exiting block is a (sub) clone of BB. This in turn gives us the + // property that any value deffed in BB but used outside of BB is used by a + // PHI in the exiting block. + MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock(); + + // Push out the epilogs, again in reverse order. + // We can't assume anything about the minumum loop trip count at this point, + // so emit a fairly complex epilog: + // K[0, 1, 2] // Kernel runs stages 0, 1, 2 + // E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0]. + // E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2. + // + // This creates a single-successor single-predecessor sequence of blocks for + // each epilog, which are kept this way for simplicity at this stage and + // cleaned up by the optimizer later. + for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) { + Epilogs.push_back(nullptr); + for (int J = Schedule.getNumStages() - 1; J >= I; --J) { + LS.reset(); + LS[J] = 1; + Epilogs.back() = peelKernel(LPD_Back); + LiveStages[Epilogs.back()] = LS; + AvailableStages[Epilogs.back()] = AS; + } + } + + // Now we've defined all the prolog and epilog blocks as a fallthrough + // sequence, add the edges that will be followed if the loop trip count is + // lower than the number of stages (connecting prologs directly with epilogs). + auto PI = Prologs.begin(); + auto EI = Epilogs.begin(); + assert(Prologs.size() == Epilogs.size()); + for (; PI != Prologs.end(); ++PI, ++EI) { + MachineBasicBlock *Pred = *(*EI)->pred_begin(); + (*PI)->addSuccessor(*EI); + for (MachineInstr &MI : (*EI)->phis()) { + Register Reg = MI.getOperand(1).getReg(); + MachineInstr *Use = MRI.getUniqueVRegDef(Reg); + if (Use && Use->getParent() == Pred) + Reg = getEquivalentRegisterIn(Reg, *PI); + MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false)); + MI.addOperand(MachineOperand::CreateMBB(*PI)); + } + } + + // Create a list of all blocks in order. + SmallVector Blocks; + llvm::copy(PeeledFront, std::back_inserter(Blocks)); + Blocks.push_back(BB); + llvm::copy(PeeledBack, std::back_inserter(Blocks)); + + // Iterate in reverse order over all instructions, remapping as we go. + for (MachineBasicBlock *B : reverse(Blocks)) { + for (auto I = B->getFirstInstrTerminator()->getReverseIterator(); + I != std::next(B->getFirstNonPHI()->getReverseIterator());) { + MachineInstr *MI = &*I++; + rewriteUsesOf(MI); + } + } + // Now all remapping has been done, we're free to optimize the generated code. + for (MachineBasicBlock *B : reverse(Blocks)) + EliminateDeadPhis(B, MRI, LIS); + EliminateDeadPhis(ExitingBB, MRI, LIS); +} + +MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() { + MachineFunction &MF = *BB->getParent(); + MachineBasicBlock *Exit = *BB->succ_begin(); + if (Exit == BB) + Exit = *std::next(BB->succ_begin()); + + MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + MF.insert(std::next(BB->getIterator()), NewBB); + + // Clone all phis in BB into NewBB and rewrite. 
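+  // Each new PHI takes the loop-carried value (operand 3) of the kernel PHI as
+  // its single incoming value, and every use of that value outside BB is
+  // redirected to the new PHI's result.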
+ for (MachineInstr &MI : BB->phis()) { + auto RC = MRI.getRegClass(MI.getOperand(0).getReg()); + Register OldR = MI.getOperand(3).getReg(); + Register R = MRI.createVirtualRegister(RC); + SmallVector Uses; + for (MachineInstr &Use : MRI.use_instructions(OldR)) + if (Use.getParent() != BB) + Uses.push_back(&Use); + for (MachineInstr *Use : Uses) + Use->substituteRegister(OldR, R, /*SubIdx=*/0, + *MRI.getTargetRegisterInfo()); + MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R) + .addReg(OldR) + .addMBB(BB); + BlockMIs[{NewBB, &MI}] = NI; + CanonicalMIs[NI] = &MI; + } + BB->replaceSuccessor(Exit, NewBB); + Exit->replacePhiUsesWith(BB, NewBB); + NewBB->addSuccessor(Exit); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond); + (void)CanAnalyzeBr; + assert(CanAnalyzeBr && "Must be able to analyze the loop branch!"); + TII->removeBranch(*BB); + TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB, + Cond, DebugLoc()); + TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc()); + return NewBB; +} + +Register +PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg, + MachineBasicBlock *BB) { + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg); + return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg(); +} + +void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) { + if (MI->isPHI()) { + // This is an illegal PHI. The loop-carried (desired) value is operand 3, + // and it is produced by this block. + Register PhiR = MI->getOperand(0).getReg(); + Register R = MI->getOperand(3).getReg(); + int RMIStage = getStage(MRI.getUniqueVRegDef(R)); + if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage)) + R = MI->getOperand(1).getReg(); + MRI.setRegClass(R, MRI.getRegClass(PhiR)); + MRI.replaceRegWith(PhiR, R); + if (LIS) + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + return; + } + + int Stage = getStage(MI); + if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 || + LiveStages[MI->getParent()].test(Stage)) + // Instruction is live, no rewriting to do. + return; + + for (MachineOperand &DefMO : MI->defs()) { + SmallVector, 4> Subs; + for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) { + // Only PHIs can use values from this block by construction. + // Match with the equivalent PHI in B. + assert(UseMI.isPHI()); + Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(), + MI->getParent()); + Subs.emplace_back(&UseMI, Reg); + } + for (auto &Sub : Subs) + Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0, + *MRI.getTargetRegisterInfo()); + } + if (LIS) + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); +} + +void PeelingModuloScheduleExpander::fixupBranches() { + std::unique_ptr Info = + TII->analyzeLoopForPipelining(BB); + assert(Info); + + // Work outwards from the kernel. 
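+  // Each prolog either falls through towards the kernel or branches to its
+  // paired epilog, depending on whether the trip count is known (or tested at
+  // runtime) to be large enough to reach the kernel from that prolog.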
+ bool KernelDisposed = false; + int TC = Schedule.getNumStages() - 1; + for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend(); + ++PI, ++EI, --TC) { + MachineBasicBlock *Prolog = *PI; + MachineBasicBlock *Fallthrough = *Prolog->succ_begin(); + MachineBasicBlock *Epilog = *EI; + SmallVector Cond; + Optional StaticallyGreater = + Info->createTripCountGreaterCondition(TC, *Prolog, Cond); + if (!StaticallyGreater.hasValue()) { + LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n"); + // Dynamically branch based on Cond. + TII->removeBranch(*Prolog); + TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc()); + } else if (*StaticallyGreater == false) { + LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n"); + // Prolog never falls through; branch to epilog and orphan interior + // blocks. Leave it to unreachable-block-elim to clean up. + Prolog->removeSuccessor(Fallthrough); + for (MachineInstr &P : Fallthrough->phis()) { + P.RemoveOperand(2); + P.RemoveOperand(1); + } + TII->removeBranch(*Prolog); + TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc()); + KernelDisposed = true; + } else { + LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n"); + // Prolog always falls through; remove incoming values in epilog. + Prolog->removeSuccessor(Epilog); + for (MachineInstr &P : Epilog->phis()) { + P.RemoveOperand(4); + P.RemoveOperand(3); + } + } + } + + if (!KernelDisposed) { + Info->adjustTripCount(-(Schedule.getNumStages() - 1)); + Info->setPreheader(Prologs.back()); + } else { + Info->disposed(); + } +} + +void PeelingModuloScheduleExpander::rewriteKernel() { + KernelRewriter KR(*Schedule.getLoop(), Schedule); + KR.rewrite(); +} + +void PeelingModuloScheduleExpander::expand() { + BB = Schedule.getLoop()->getTopBlock(); + Preheader = Schedule.getLoop()->getLoopPreheader(); + LLVM_DEBUG(Schedule.dump()); + + rewriteKernel(); + peelPrologAndEpilogs(); + fixupBranches(); +} + void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() { BB = Schedule.getLoop()->getTopBlock(); Preheader = Schedule.getLoop()->getLoopPreheader(); @@ -1593,6 +1854,7 @@ // Now run the new expansion algorithm. KernelRewriter KR(*Schedule.getLoop(), Schedule); KR.rewrite(); + peelPrologAndEpilogs(); // Collect all illegal phis that the new algorithm created. We'll give these // to KernelOperandInfo. Index: llvm/trunk/test/CodeGen/Hexagon/pipeliner/swp-phi-start.mir =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/pipeliner/swp-phi-start.mir +++ llvm/trunk/test/CodeGen/Hexagon/pipeliner/swp-phi-start.mir @@ -1,4 +1,4 @@ -# RUN: llc < %s -x mir -march=hexagon -run-pass=modulo-schedule-test | FileCheck %s +# RUN: llc < %s -x mir -march=hexagon -run-pass=modulo-schedule-test -pipeliner-experimental-cg=true | FileCheck %s # Simple check for this sanity test; ensure all instructions are in stage 0 in # the prolog and stage 3 in the epilog. Index: llvm/trunk/test/CodeGen/Hexagon/swp-art-deps-rec.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-art-deps-rec.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-art-deps-rec.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts ; RUN: llc -march=hexagon -mcpu=hexagonv65 -O3 -debug-only=pipeliner \ -; RUN: < %s 2>&1 | FileCheck %s +; RUN: < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s ; Test that the artificial dependences are ignored while computing the ; circuits. 
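Note: the prolog/epilog staging scheme documented in peelPrologAndEpilogs() above generalizes to any stage count. The following standalone sketch is illustrative only and not part of this patch; NumStages = 3 is a hypothetical value, and the P/E/K labels follow the naming used in that comment (K[0, 1, 2], E0[2] <- P1, E1[1, 2] <- P0).

  // Illustrative sketch: reproduce the prolog/epilog staging table from the
  // comment in peelPrologAndEpilogs() for a hypothetical three-stage schedule.
  #include <cstdio>

  int main() {
    const int NumStages = 3; // hypothetical stage count
    // Prolog P<I> executes stages 0..I.
    for (int I = 0; I < NumStages - 1; ++I)
      std::printf("P%d runs stages 0..%d\n", I, I);
    // The kernel executes all stages in steady state.
    std::printf("K  runs stages 0..%d\n", NumStages - 1);
    // Prolog P<I> is paired with epilog row E<NumStages - 2 - I>, which runs
    // the stages P<I> has started but not finished: stages I+1..NumStages-1.
    for (int I = 0; I < NumStages - 1; ++I)
      std::printf("E%d runs stages %d..%d (entered from P%d on short trip counts)\n",
                  NumStages - 2 - I, I + 1, NumStages - 1, I);
  }

For three stages this prints the same table as the comment: P0 runs stage 0, P1 runs stages 0-1, the kernel runs all three, E0 runs only stage 2 and is entered from P1, and E1 runs stages 1-2 and is entered from P0.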
Index: llvm/trunk/test/CodeGen/Hexagon/swp-bad-sched.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-bad-sched.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-bad-sched.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -march=hexagon -enable-pipeliner -enable-aa-sched-mi < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -enable-aa-sched-mi < %s -pipeliner-experimental-cg=true | FileCheck %s ; CHECK: loop0( ; CHECK: loop0(.LBB0_[[LOOP:.]], Index: llvm/trunk/test/CodeGen/Hexagon/swp-carried-1.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-carried-1.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-carried-1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 < %s | FileCheck %s +; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct code when a loop carried value ; is scheduled one stage earlier than it's use. The code in Index: llvm/trunk/test/CodeGen/Hexagon/swp-carried-dep1.mir =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-carried-dep1.mir +++ llvm/trunk/test/CodeGen/Hexagon/swp-carried-dep1.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s +# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s # REQUIRES: asserts # Test that the loop carried dependence check correctly identifies a recurrence. Index: llvm/trunk/test/CodeGen/Hexagon/swp-carried-dep2.mir =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-carried-dep2.mir +++ llvm/trunk/test/CodeGen/Hexagon/swp-carried-dep2.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s +# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s # REQUIRES: asserts # Test that the loop carried dependence check correctly identifies a recurrence Index: llvm/trunk/test/CodeGen/Hexagon/swp-chain-refs.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-chain-refs.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-chain-refs.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=hexagon -enable-pipeliner=true -stats -o /dev/null < %s \ -; RUN: 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; Test that we do not schedule chained references too far apart, Index: llvm/trunk/test/CodeGen/Hexagon/swp-change-dep1.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-change-dep1.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-change-dep1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=1 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=1 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we update the offset correctly for loads that are ; moved past stores. 
In these cases, we change the dependences Index: llvm/trunk/test/CodeGen/Hexagon/swp-change-deps.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-change-deps.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-change-deps.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct offsets for loads in the prolog ; after removing dependences on a post-increment instructions of the Index: llvm/trunk/test/CodeGen/Hexagon/swp-check-offset.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-check-offset.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-check-offset.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s -; RUN: llc -march=hexagon -mcpu=hexagonv62 -enable-pipeliner < %s | FileCheck --check-prefix=CHECK-V62 %s -; RUN: llc -march=hexagon -mcpu=hexagonv65 -enable-pipeliner < %s | FileCheck --check-prefix=CHECK-V65 %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv62 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck --check-prefix=CHECK-V62 %s +; RUN: llc -march=hexagon -mcpu=hexagonv65 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck --check-prefix=CHECK-V65 %s ; ; Make sure we pipeline the loop and that we generate the correct Index: llvm/trunk/test/CodeGen/Hexagon/swp-const-tc1.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-const-tc1.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-const-tc1.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=hexagon -enable-pipeliner -enable-pipeliner-opt-size \ ; RUN: -verify-machineinstrs -hexagon-initial-cfg-cleanup=0 \ ; RUN: -enable-aa-sched-mi=false -hexagon-expand-condsets=0 \ -; RUN: < %s | FileCheck %s +; RUN: < %s -pipeliner-experimental-cg=true | FileCheck %s ; Disable expand-condsets because it will assert on undefined registers. Index: llvm/trunk/test/CodeGen/Hexagon/swp-const-tc2.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-const-tc2.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-const-tc2.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -rdf-opt=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -rdf-opt=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we fixup a pipelined loop correctly when the number of ; stages is greater than the compile-time loop trip count. 
In this Index: llvm/trunk/test/CodeGen/Hexagon/swp-const-tc3.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-const-tc3.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-const-tc3.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the pipeliner correctly fixes up the pipelined CFG when the loop ; has a constant trip count, and the trip count is less than the number of Index: llvm/trunk/test/CodeGen/Hexagon/swp-conv3x3-nested.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-conv3x3-nested.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-conv3x3-nested.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; XFAIL: * ; LSR changes required. Index: llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts ; ; RUN: llc -march=hexagon -enable-pipeliner=true -debug-only=pipeliner < %s \ -; RUN: 2>&1 | FileCheck %s +; RUN: 2>&1 -pipeliner-experimental-cg=true | FileCheck %s ; Test that the artificial dependence is created as a result of ; CopyToPhi DAG mutation. Index: llvm/trunk/test/CodeGen/Hexagon/swp-dep-neg-offset.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-dep-neg-offset.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-dep-neg-offset.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the code that changes the dependences does not allow ; a load with a negative offset to be overlapped with the post Index: llvm/trunk/test/CodeGen/Hexagon/swp-disable-Os.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-disable-Os.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-disable-Os.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; CHECK: loop0(.LBB0_{{[0-9]+}},#347) target triple = "hexagon" Index: llvm/trunk/test/CodeGen/Hexagon/swp-epilog-numphis.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-epilog-numphis.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-epilog-numphis.ll @@ -1,6 +1,6 @@ ; XFAIL: * ; Needs some fixed in the pipeliner. 
-; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; CHECK: endloop0 ; CHECK: vmem Index: llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi2.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi2.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi2.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=3 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=3 < %s -pipeliner-experimental-cg=true | FileCheck %s %s.0 = type { i16, i8, i8, i16, i8, i8, i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i16, i8, i8, %s.1, [2 x [16 x %s.2]], i32 (i8*, i8*, i8*, i8*, i8*)*, %s.3*, %s.3*, [120 x i8], i8, i8, %s.4*, [2 x [120 x [8 x i8]]], [56 x i8], [2 x [121 x %s.5]], [2 x %s.5], %s.5*, %s.5*, i32, i32, i16, i8, i8, %s.7, %s.9, %s.11, %s.8*, %s.8* } %s.1 = type { i8, i8, i8, i8, i8, i8, i8, i8, i32, i8, [16 x i8], i8, [4 x i8], [32 x i16], [32 x i16], [2 x i8], [4 x i8], [2 x [4 x i8]], [2 x [4 x i8]], i32, i32, i16, i8 } Index: llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi4.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi4.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi4.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct value for a Phi in the epilog ; that is for a value defined two stages earlier. An extra copy in the Index: llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi5.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi5.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi5.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we use the correct name in an epilog phi for a phi value ; that is defined for the last time in the kernel. Previously, we Index: llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi8.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi8.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-epilog-phi8.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mno-pairing -mno-compound -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -mno-pairing -mno-compound -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; XFAIL: * ; Test that we generate the correct phi names in the epilog when the pipeliner Index: llvm/trunk/test/CodeGen/Hexagon/swp-kernel-phi1.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-kernel-phi1.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-kernel-phi1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct names for the phis in the kernel for the ; incoming values. 
In this case, the loop contains a phi and has another phi Index: llvm/trunk/test/CodeGen/Hexagon/swp-large-rec.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-large-rec.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-large-rec.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=hexagon -enable-pipeliner -stats \ ; RUN: -pipeliner-prune-loop-carried=false -fp-contract=fast \ -; RUN: -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; That that we do not pipeline this loop. The recurrence is too large. If Index: llvm/trunk/test/CodeGen/Hexagon/swp-listen-loop3.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-listen-loop3.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-listen-loop3.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -pipeliner-ignore-recmii -pipeliner-max-stages=2 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -pipeliner-ignore-recmii -pipeliner-max-stages=2 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; This is a loop we pipeline to three packets, though we could do bettter. Index: llvm/trunk/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the pipeliner schedules a store before the load in which there is a ; loop carried dependence. Previously, the loop carried dependence wasn't added Index: llvm/trunk/test/CodeGen/Hexagon/swp-lots-deps.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-lots-deps.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-lots-deps.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; STATS: 1 pipeliner - Number of loops software pipelined Index: llvm/trunk/test/CodeGen/Hexagon/swp-max.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-max.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-max.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner \ -; RUN: -pipeliner-max-stages=2 < %s | FileCheck %s +; RUN: -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s @A = global [8 x i32] [i32 4, i32 -3, i32 5, i32 -2, i32 -1, i32 2, i32 6, i32 -2], align 8 Index: llvm/trunk/test/CodeGen/Hexagon/swp-maxstart.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-maxstart.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-maxstart.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -O3 < %s | FileCheck %s +; RUN: llc -march=hexagon -O3 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the MinStart computation, which is based upon the length ; of the chain edges, is computed correctly. 
A bug in the code allowed Index: llvm/trunk/test/CodeGen/Hexagon/swp-memrefs-epilog.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-memrefs-epilog.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-memrefs-epilog.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -O2 -fp-contract=fast < %s | FileCheck %s +; RUN: llc -march=hexagon -O2 -fp-contract=fast < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the memoperands for instructions in the epilog are updated ; correctly. Previously, the pipeliner updated the offset for the memoperands Index: llvm/trunk/test/CodeGen/Hexagon/swp-multi-loops.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-multi-loops.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-multi-loops.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; Make sure we attempt to pipeline all inner most loops. Index: llvm/trunk/test/CodeGen/Hexagon/swp-new-phi.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-new-phi.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-new-phi.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the generatePhi code doesn't rename a a Phi instruction that's defined ; in the same block. The bug causes a Phi to incorrectly depend on another Phi. Index: llvm/trunk/test/CodeGen/Hexagon/swp-order-copies.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-order-copies.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-order-copies.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the instruction ordering code in the pipeliner fixes up dependences ; between post-increment register definitions and uses so that the register Index: llvm/trunk/test/CodeGen/Hexagon/swp-order-deps7.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-order-deps7.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-order-deps7.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the pipeliner cause an assert and correctly pipelines the ; loop. 
Index: llvm/trunk/test/CodeGen/Hexagon/swp-order.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-order.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-order.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that when we order instructions in a packet we check for ; order dependences so that the source of an order dependence Index: llvm/trunk/test/CodeGen/Hexagon/swp-phi-ch-offset.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-phi-ch-offset.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-phi-ch-offset.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct offsets after we removed unneeded ; chain dependences between Phis and generated a better pipeline. Index: llvm/trunk/test/CodeGen/Hexagon/swp-phi-chains.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-phi-chains.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-phi-chains.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -debug-only=pipeliner < %s -o - 2>&1 | FileCheck %s +; RUN: llc -march=hexagon -debug-only=pipeliner < %s -o - 2>&1 -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; Test that there is a chain edge between two dependent Phis. Index: llvm/trunk/test/CodeGen/Hexagon/swp-phi-dep.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-phi-dep.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-phi-dep.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -enable-pipeliner -pipeliner-max-stages=2 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -enable-pipeliner -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Check that the pipelined code uses the proper address in the ; prolog and the kernel. The bug occurs when the address computation Index: llvm/trunk/test/CodeGen/Hexagon/swp-phi-ref.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-phi-ref.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-phi-ref.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -enable-bsb-sched=0 -join-liveintervals=false < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -enable-bsb-sched=0 -join-liveintervals=false < %s -pipeliner-experimental-cg=true | FileCheck %s ; XFAIL: * ; This test is failing after post-ra machine sinking. 
Index: llvm/trunk/test/CodeGen/Hexagon/swp-pragma-disable.ii =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-pragma-disable.ii +++ llvm/trunk/test/CodeGen/Hexagon/swp-pragma-disable.ii @@ -1,5 +1,5 @@ ; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \ -; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null | FileCheck %s +; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; ; Test that checks if pipeliner disabled by pragma Index: llvm/trunk/test/CodeGen/Hexagon/swp-pragma-initiation-interval.ii =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-pragma-initiation-interval.ii +++ llvm/trunk/test/CodeGen/Hexagon/swp-pragma-initiation-interval.ii @@ -1,5 +1,5 @@ ; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \ -; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null | FileCheck %s +; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; ; Test that checks if the II set by pragma was taken by pipeliner. Index: llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -rdf-opt=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -rdf-opt=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct name for a value in a prolog block. The ; pipeliner was using an incorrect value for an instruction in the 2nd prolog Index: llvm/trunk/test/CodeGen/Hexagon/swp-rename.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-rename.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-rename.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; A test that the Phi rewrite logic is correct. Index: llvm/trunk/test/CodeGen/Hexagon/swp-resmii-1.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-resmii-1.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-resmii-1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; Test that checks that we compute the correct ResMII for haar. Index: llvm/trunk/test/CodeGen/Hexagon/swp-resmii.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-resmii.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-resmii.ll @@ -1,5 +1,5 @@ ; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \ -; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null | FileCheck %s +; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; ; Test that checks if the ResMII is 1. 
Index: llvm/trunk/test/CodeGen/Hexagon/swp-reuse-phi-6.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-reuse-phi-6.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-reuse-phi-6.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the pipeliner generates correct code when attempting to reuse ; an existing phi. This test case contains a phi that references another Index: llvm/trunk/test/CodeGen/Hexagon/swp-sigma.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-sigma.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-sigma.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -O2 < %s | FileCheck %s +; RUN: llc -march=hexagon -O2 < %s -pipeliner-experimental-cg=true | FileCheck %s ; We do not pipeline sigma yet, but the non-pipelined version ; with good scheduling is pretty fast. The compiler generates Index: llvm/trunk/test/CodeGen/Hexagon/swp-stages4.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-stages4.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-stages4.ll @@ -1,11 +1,11 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -disable-block-placement=0 -hexagon-bit=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -disable-block-placement=0 -hexagon-bit=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we rename registers correctly for multiple stages when there is a ; Phi and depends upon another Phi. ; CHECK: = and ; CHECK: = and -; CHECK: = and +; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1) ; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255) ; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255) ; CHECK: loop0(.LBB0_[[LOOP:.]], Index: llvm/trunk/test/CodeGen/Hexagon/swp-stages5.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-stages5.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-stages5.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -hexagon-bit=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -hexagon-bit=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Very similar to swp-stages4.ll, but the pipelined schedule is a little ; different. Index: llvm/trunk/test/CodeGen/Hexagon/swp-subreg.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-subreg.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-subreg.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; We're unable to pipeline a loop with a subreg as an operand of a Phi. 
Index: llvm/trunk/test/CodeGen/Hexagon/swp-swap.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-swap.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-swap.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; Test that we don't pipeline, incorrectly, the swap operation. Index: llvm/trunk/test/CodeGen/Hexagon/swp-tfri.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-tfri.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-tfri.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; Check that we handle the case when a value is first defined in the loop. Index: llvm/trunk/test/CodeGen/Hexagon/swp-vect-dotprod.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-vect-dotprod.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-vect-dotprod.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s -; RUN: llc -march=hexagon -mcpu=hexagonv5 -O2 < %s | FileCheck %s -; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -O2 < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s -pipeliner-experimental-cg=true | FileCheck %s ; ; Check that we pipeline a vectorized dot product in a single packet. ; Index: llvm/trunk/test/CodeGen/Hexagon/swp-vmult.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-vmult.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-vmult.ll @@ -1,5 +1,5 @@ ; REQUIRES: to-be-fixed -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; Multiply and accumulate ; CHECK: mpyi([[REG0:r([0-9]+)]],[[REG1:r([0-9]+)]]) Index: llvm/trunk/test/CodeGen/Hexagon/swp-vsum.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-vsum.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-vsum.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s -; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s | FileCheck %s --check-prefix=CHECKV60 +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=CHECKV60 ; Simple vector total. ; CHECK: loop0(.LBB0_[[LOOP:.]],