Index: include/llvm/CodeGen/MachineUnroller.h =================================================================== --- /dev/null +++ include/llvm/CodeGen/MachineUnroller.h @@ -0,0 +1,131 @@ +//===-------- llvm/CodeGen/MachineUnroller.h - Unrolling utilities --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines loop unrolling utilities used at MI level. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINEUNROLLER_H +#define LLVM_CODEGEN_MACHINEUNROLLER_H + +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" + +namespace llvm { + +// This is a utility class for unrolling loops at MI level. +// It only unroll loops with the run-time trip count and +// with a single basic block. 
+// +// After unrolling, the loop structure will be the following: +// +// Original LoopPreheader +// Unrolled LoopPreheader +// Unrolled Loop +// Unrolled LoopExit +// Remainder LoopPreheader +// Remainder Loop +// Remainder LoopExit +// Original LoopExit + +struct MachineUnrollerContext { + MachineFunction *MF = nullptr; + MachineLoopInfo *MLI = nullptr; + LiveIntervals *LIS = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineUnrollerContext() {} + MachineUnrollerContext(MachineFunction *mf, MachineLoopInfo *mli, + LiveIntervals *lis, const TargetInstrInfo *tii) + : MF(mf), MLI(mli), LIS(lis), TII(tii) {} +}; + +class MachineUnroller { +protected: + MachineFunction *MF = nullptr; + MachineLoopInfo *MLI = nullptr; + LiveIntervals *LIS = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + MachineLoop *L; + MachineBasicBlock *OrigHeader; + MachineBasicBlock *OrigPreheader; + MachineBasicBlock *ULPreheader; + MachineBasicBlock *ULHeader; + MachineBasicBlock *ULExit; + MachineBasicBlock *RLPreheader; + MachineBasicBlock *RLHeader; + MachineBasicBlock *RLExit; + MachineBasicBlock *OrigLoopExit; + MachineInstr *LoopIndVar; + MachineInstr *LoopCmp; + unsigned UnrollFactor; + unsigned LC; + SmallVector LoopBBs; + SmallVector ExitBBLiveIns; + + typedef SmallDenseMap, 4> + ValueMapTy; + ValueMapTy VRMap; + DenseMap ULPhiVRMap; + void createUnrolledLoopStruct(); + void updateInstruction(MachineInstr *NewMI, bool FirstIter, + ValueMapTy &OldVRMap); + void generateUnrolledLoop(); + unsigned getMappedRegORCreate(unsigned Reg, MachineBasicBlock *BB); + void generateNewPhis(MachineBasicBlock *BB, MachineBasicBlock *BB1, + MachineBasicBlock *BB2); + void generatePhisForRLExit(); + void generatePhisForULExit(); + void getExitBBLiveIns(); + void addBBIntoVRMap(MachineBasicBlock *BB); + void fixBranchesAndLoopCount(unsigned ULCount, unsigned RLCount); + unsigned getLatestInstance(unsigned reg, MachineBasicBlock *BB, + ValueMapTy &VRMap); + 
void init(MachineLoop *loop, unsigned unrollFactor); + bool canUnroll(); + void preprocessPhiNodes(MachineBasicBlock &B); + +public: + MachineUnroller(MachineUnrollerContext *C) + : MF(C->MF), MLI(C->MLI), LIS(C->LIS), TII(C->TII) { + MRI = &MF->getRegInfo(); + } + + virtual ~MachineUnroller() = default; + + bool unroll(MachineLoop *loop, unsigned unrollFactor); + + virtual unsigned getLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar, + MachineInstr &Cmp) const = 0; + + /// Add instruction to compute trip count for the unrolled loop. + virtual unsigned addUnrolledLoopCountMI(MachineBasicBlock &MBB, unsigned LC, + unsigned UnrollFactor) const = 0; + + /// Add instruction to compute remainder trip count for the unrolled loop. + virtual unsigned addRemLoopCountMI(MachineBasicBlock &MBB, unsigned LC, + unsigned UnrollFactor) const = 0; + + virtual void changeLoopCount(MachineBasicBlock &BB, + MachineBasicBlock &Preheader, + MachineBasicBlock &Header, unsigned LC, + MachineInstr *IndVar, MachineInstr &Cmp, + SmallVectorImpl &Cond) const = 0; + + bool computeDelta(MachineInstr &MI, unsigned &Delta) const; + void updateMemOperands(MachineInstr *NewMI, MachineInstr *OldMI, + unsigned iter) const; + virtual void optimize(MachineBasicBlock &BB) const {}; +}; +} // namespace llvm +#endif Index: include/llvm/CodeGen/TargetPassConfig.h =================================================================== --- include/llvm/CodeGen/TargetPassConfig.h +++ include/llvm/CodeGen/TargetPassConfig.h @@ -25,6 +25,8 @@ struct MachineSchedContext; class PassConfigImpl; class ScheduleDAGInstrs; +class MachineUnroller; +struct MachineUnrollerContext; // The old pass manager infrastructure is hidden in a legacy namespace now. namespace legacy { @@ -280,11 +282,17 @@ /// return new ScheduleDAGMI(C, make_unique(C), /*RemoveKillFlags=*/false) /// /// Return NULL to select the default (generic) machine scheduler. 
+ virtual ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const { return nullptr; } + virtual MachineUnroller * + createMachineUnroller(MachineUnrollerContext *C) const { + return nullptr; + } + + /// Similar to createMachineScheduler but used when postRA machine scheduling + /// is enabled. virtual ScheduleDAGInstrs * Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -92,6 +92,7 @@ MachineSink.cpp MachineSSAUpdater.cpp MachineTraceMetrics.cpp + MachineUnroller.cpp MachineVerifier.cpp PatchableFunction.cpp MIRPrinter.cpp Index: lib/CodeGen/MachinePipeliner.cpp =================================================================== --- lib/CodeGen/MachinePipeliner.cpp +++ lib/CodeGen/MachinePipeliner.cpp @@ -86,11 +86,13 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/MachineUnroller.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" @@ -133,6 +135,16 @@ cl::ZeroOrMore, cl::desc("Enable Software Pipelining")); +/// A command line option to turn unrolling on or off before pipelining the loop. +static cl::opt + EnableSWPUnroll("enable-pipeliner-unroll", cl::Hidden, cl::init(false), + cl::ZeroOrMore, cl::desc("Enable runtime unrolling before pipelining")); + +/// A command line argument to limit size of the unrolled loop. +static cl::opt SwpUnrollThres("pipeliner-unroll-threshold", + cl::desc("Size limit for the unrolled loop."), + cl::Hidden, cl::init(30)); + +/// A command line option to enable SWP at -Os. 
static cl::opt EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, @@ -165,6 +177,8 @@ #ifndef NDEBUG static cl::opt SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1)); +static cl::opt SwpUnrollLimit("pipeliner-unroll-max", + cl::Hidden, cl::init(-1)); #endif static cl::opt SwpIgnoreRecMII("pipeliner-ignore-recmii", @@ -180,11 +194,13 @@ /// software pipeliner pass. class MachinePipeliner : public MachineFunctionPass { public: + const TargetPassConfig *PassConfig = nullptr; MachineFunction *MF = nullptr; const MachineLoopInfo *MLI = nullptr; const MachineDominatorTree *MDT = nullptr; const InstrItineraryData *InstrItins; const TargetInstrInfo *TII = nullptr; + MachineUnroller *Unroller = nullptr; RegisterClassInfo RegClassInfo; #ifndef NDEBUG @@ -215,6 +231,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -305,6 +322,9 @@ RegClassInfo(rci), Topo(SUnits, &ExitSU) { P.MF->getSubtarget().getSMSMutations(Mutations); } +#ifndef NDEBUG + static int NumUnrollTries; +#endif void schedule() override; void finishBlock() override; @@ -386,7 +406,7 @@ void addLoopCarriedDependences(AliasAnalysis *AA); void updatePhiDependences(); void changeDependences(); - unsigned calculateResMII(); + unsigned calculateResMII(unsigned UnrollCount = 1); unsigned calculateRecMII(NodeSetType &RecNodeSets); void findCircuits(NodeSetType &NodeSets); void fuseRecs(NodeSetType &NodeSets); @@ -422,9 +442,10 @@ MBBVectorTy &EpilogBBs); void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, SMSchedule &Schedule); + void removeBB(MachineBasicBlock *RemoveBB, MBBVectorTy &UpdateBBs); void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs, SMSchedule &Schedule, - ValueMapTy *VRMap); + MBBVectorTy &EpilogBBs, MBBVectorTy &UpdateBBs, + SMSchedule &Schedule, ValueMapTy *VRMap); bool computeDelta(MachineInstr &MI, unsigned &Delta); 
void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, unsigned Num); @@ -722,6 +743,7 @@ char MachinePipeliner::ID = 0; #ifndef NDEBUG int MachinePipeliner::NumTries = 0; +int SwingSchedulerDAG::NumUnrollTries = 0; #endif char &llvm::MachinePipelinerID = MachinePipeliner::ID; @@ -752,10 +774,16 @@ MDT = &getAnalysis(); TII = MF->getSubtarget().getInstrInfo(); RegClassInfo.runOnMachineFunction(*MF); - + PassConfig = &getAnalysis(); + if (EnableSWPUnroll) { + MachineUnrollerContext C(MF, &getAnalysis(), + &getAnalysis(), TII); + Unroller = PassConfig->createMachineUnroller(&C); + } for (auto &L : *MLI) scheduleLoop(*L); + delete Unroller; return false; } @@ -876,10 +904,23 @@ return SMS.hasNewSchedule(); } +static unsigned getNonDebugMBBSize(MachineBasicBlock *MBB) { + int size = 0; + for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(), + E = MBB->getFirstTerminator(); + I != E; ++I) { + if (!I->isDebugValue()) + size++; + } + return size; +} + /// We override the schedule function in ScheduleDAGInstrs to implement the /// scheduling part of the Swing Modulo Scheduling algorithm. void SwingSchedulerDAG::schedule() { AliasAnalysis *AA = &Pass.getAnalysis().getAAResults(); + MachineLoopInfo *MLI = &Pass.getAnalysis(); + buildSchedGraph(AA); addLoopCarriedDependences(AA); updatePhiDependences(); @@ -896,6 +937,79 @@ unsigned ResMII = calculateResMII(); unsigned RecMII = calculateRecMII(NodeSets); + bool UnrollLimitReached = false; +#ifndef NDEBUG + // Stop unrolling after reaching the limit (if any). + int Limit = SwpUnrollLimit; + if (Limit >= 0) { + if (NumUnrollTries >= SwpUnrollLimit) + UnrollLimitReached = true; + } +#endif + + // Try to unroll the loop only if ResMII >= RecMII. 
+ if ((ResMII >= RecMII) && EnableSWPUnroll && !UnrollLimitReached) { + unsigned MinResMII = ResMII; + unsigned MinUnrollFactor = 1; + unsigned UnrollThres = 4; + unsigned LoopHeaderSize = getNonDebugMBBSize(Loop.getHeader()); + for (unsigned i = 2; i <= UnrollThres; i+=2) { + unsigned UnrollResMII = calculateResMII(i); + LLVM_DEBUG(dbgs() << "Unroll Factor = " << i << "(res=" << UnrollResMII + << ")\n"); + float UnrollResMIIRatio = (float) UnrollResMII / i; + float MinResMIIRatio = (float) MinResMII / MinUnrollFactor; + if (UnrollResMIIRatio < MinResMIIRatio && + (LoopHeaderSize * i) <= SwpUnrollThres) { + MinResMII = UnrollResMII; + MinUnrollFactor = i; + } + } + + LLVM_DEBUG(dbgs() << "Best Unroll Factor = " << MinUnrollFactor + << "(res=" << MinResMII << ")\n"); + + bool Changed = false; + if (MinUnrollFactor > 1) + Changed = Pass.Unroller->unroll(&Loop, MinUnrollFactor); + + if (Changed) { +#ifndef NDEBUG + NumUnrollTries++; +#endif + this->MLI = MLI; + Pass.LI.TBB = nullptr; + Pass.LI.FBB = nullptr; + Pass.LI.BrCond.clear(); + if (TII->analyzeBranch(*Loop.getHeader(), Pass.LI.TBB, Pass.LI.FBB, Pass.LI.BrCond)) + return; + + Pass.LI.LoopInductionVar = nullptr; + Pass.LI.LoopCompare = nullptr; + if (TII->analyzeLoop(Loop, Pass.LI.LoopInductionVar, Pass.LI.LoopCompare)) + return; + + MachineBasicBlock *MBB = Loop.getHeader(); + startBlock(MBB); + unsigned size = MBB->size(); + enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size); + buildSchedGraph(AA); + addLoopCarriedDependences(AA); + updatePhiDependences(); + Topo.InitDAGTopologicalSorting(); + postprocessDAG(); + changeDependences(); + LLVM_DEBUG(dump()); + + NodeSets.clear(); + findCircuits(NodeSets); + + // Recalculate the MII after unrolling. + ResMII = calculateResMII(); + RecMII = calculateRecMII(NodeSets); + } + } + fuseRecs(NodeSets); // This flag is used for testing and can cause correctness problems. @@ -1367,7 +1481,7 @@ /// for each cycle that is required. 
When adding a new instruction, we attempt /// to add it to each existing DFA, until a legal space is found. If the /// instruction cannot be reserved in an existing DFA, we create a new one. -unsigned SwingSchedulerDAG::calculateResMII() { +unsigned SwingSchedulerDAG::calculateResMII(unsigned UnrollFactor) { SmallVector Resources; MachineBasicBlock *MBB = Loop.getHeader(); Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget())); @@ -1383,11 +1497,14 @@ PriorityQueue, FuncUnitSorter> FuncUnitOrder(FUS); - for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(), - E = MBB->getFirstTerminator(); - I != E; ++I) - FuncUnitOrder.push(&*I); - + // To compute ResMII for the unrolled loop, simply replicate instructions as + // many times as the unroll factor. + for (unsigned i = 0; i < UnrollFactor; i++) { + for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(), + E = MBB->getFirstTerminator(); + I != E; ++I) + FuncUnitOrder.push(&*I); + } while (!FuncUnitOrder.empty()) { MachineInstr *MI = FuncUnitOrder.top(); FuncUnitOrder.pop(); @@ -2313,6 +2430,20 @@ return scheduleFound && Schedule.getMaxStageCount() > 0; } +static void updateLiveness(SmallVector &MBBList, + LiveIntervals &LIS) { + for (auto MBB: MBBList) { + for (MachineInstr &MI : *MBB) { + if (!LIS.isNotInMIMap(MI)) + LIS.RemoveMachineInstrFromMaps(MI); + if (MI.isDebugValue()) + continue; + LIS.InsertMachineInstrInMaps(MI); + } + } +} + + /// Given a schedule for the loop, generate a new version of the loop, /// and replace the old version. This function generates a prolog /// that contains the initial iterations in the pipeline, and kernel @@ -2335,6 +2466,7 @@ // Generate the prolog instructions that set up the pipeline. generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs); MF.insert(BB->getIterator(), KernelBB); + LIS.insertMBBInMaps(KernelBB); // Rearrange the instructions to generate the new, pipelined loop, // and update register names as needed. 
@@ -2387,8 +2519,14 @@ // Remove dead instructions due to loop induction variables. removeDeadInstructions(KernelBB, EpilogBBs); + // Add PrologBBs, KernelBB and EpilogBBs for the liveness update later. + SmallVector UpdateBBs; + UpdateBBs.insert(UpdateBBs.begin(), PrologBBs.begin(), PrologBBs.end()); + UpdateBBs.insert(UpdateBBs.end(), KernelBB); + UpdateBBs.insert(UpdateBBs.end(), EpilogBBs.begin(), EpilogBBs.end()); + // Add branches between prolog and epilog blocks. - addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap); + addBranches(PrologBBs, KernelBB, EpilogBBs, UpdateBBs, Schedule, VRMap); // Remove the original loop since it's no longer referenced. for (auto &I : *BB) @@ -2396,6 +2534,9 @@ BB->clear(); BB->eraseFromParent(); + // Update liveness + updateLiveness(UpdateBBs, LIS); + delete[] VRMap; } @@ -2419,6 +2560,7 @@ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); PrologBBs.push_back(NewBB); MF.insert(BB->getIterator(), NewBB); + LIS.insertMBBInMaps(NewBB); NewBB->transferSuccessors(PredBB); PredBB->addSuccessor(NewBB); PredBB = NewBB; @@ -2494,7 +2636,7 @@ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(); EpilogBBs.push_back(NewBB); MF.insert(BB->getIterator(), NewBB); - + LIS.insertMBBInMaps(NewBB); PredBB->replaceSuccessor(LoopExitBB, NewBB); NewBB->addSuccessor(LoopExitBB); @@ -2567,8 +2709,6 @@ if (O.getParent()->getParent() != MBB) O.setReg(ToReg); } - if (!LIS.hasInterval(ToReg)) - LIS.createEmptyInterval(ToReg); } /// Return true if the register has a use that occurs outside the @@ -3070,12 +3210,29 @@ } } +// Remove basic block from its parent and also from UpdateBBs as +// we don't need for the liveness update any longer. 
+void SwingSchedulerDAG::removeBB(MachineBasicBlock *RemoveBB, + MBBVectorTy &UpdateBBs) { + for (MBBVectorTy::const_iterator MBB = UpdateBBs.begin(), + MBE = UpdateBBs.end(); + MBB != MBE; ++MBB) { + if (*MBB == RemoveBB) { + UpdateBBs.erase(MBB); + break; + } + } + RemoveBB->clear(); + RemoveBB->eraseFromParent(); +} + /// Create branches from each prolog basic block to the appropriate epilog /// block. These edges are needed if the loop ends before reaching the /// kernel. void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, + MBBVectorTy &UpdateBBs, SMSchedule &Schedule, ValueMapTy *VRMap) { assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch"); MachineInstr *IndVar = Pass.LI.LoopInductionVar; @@ -3119,12 +3276,10 @@ numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc()); removePhis(Epilog, LastEpi); // Remove the blocks that are no longer referenced. - if (LastPro != LastEpi) { - LastEpi->clear(); - LastEpi->eraseFromParent(); - } - LastPro->clear(); - LastPro->eraseFromParent(); + if (LastPro != LastEpi) + removeBB(LastEpi, UpdateBBs); + + removeBB(LastPro, UpdateBBs); } else { numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc()); removePhis(Epilog, Prolog); Index: lib/CodeGen/MachineUnroller.cpp =================================================================== --- /dev/null +++ lib/CodeGen/MachineUnroller.cpp @@ -0,0 +1,728 @@ +//===------- MachineUnroller.cpp - Machine Loop unrolling utilities -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file implements the loop unrolling functionality at MI level. 
+//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineUnroller.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "mi-loop-unroll" + +// This is a utility class for unrolling loops at MI level. +// It only unrolls loops with the run-time trip count and +// with a single basic block. +// +// After unrolling, the loop structure will be the following: +// +// Original LoopPreheader +// Unrolled LoopPreheader +// Unrolled Loop +// Unrolled LoopExit +// Remainder LoopPreheader +// Remainder Loop +// Remainder LoopExit +// Original LoopExit + +void MachineUnroller::init(MachineLoop *loop, unsigned unrollFactor) { + L = loop; + UnrollFactor = unrollFactor; + OrigHeader = L->getHeader(); + OrigPreheader = L->getLoopPreheader(); + OrigLoopExit = L->getExitBlock(); + LoopBBs.clear(); + ExitBBLiveIns.clear(); +} + +bool MachineUnroller::canUnroll() { + // Only loops with a single basic block are handled. Also, the loop must + // be analyzable using analyzeBranch. It's the responsibility of the caller of + // this function to make sure that these requirements are met. + assert(L->getNumBlocks() == 1 && "Only loops with single basic block can be" + "unrolled!!"); + if (!isPowerOf2_32(UnrollFactor)) { + LLVM_DEBUG(dbgs() << "Can't Unroll!! UnrollFactor must be a power of 2."); + return false; + } + + LoopIndVar = nullptr; + LoopCmp = nullptr; + if (TII->analyzeLoop(*L, LoopIndVar, LoopCmp)) + return false; + + // Get loop trip count. Compile-time trip count is not handled. 
+ LC = getLoopCount(*OrigHeader, LoopIndVar, *LoopCmp); + return TargetRegisterInfo::isVirtualRegister(LC); +} + +/// Create empty basic blocks for the unrolled/remainder loops and +/// add them to the CFG. Some BBs from the original loop are reused +/// and their successors/predecessors are changed as needed. +void MachineUnroller::createUnrolledLoopStruct() { + // Create basic blocks for the Unrolled Loop. + ULPreheader = MF->CreateMachineBasicBlock(); + MF->insert(OrigHeader->getIterator(), ULPreheader); + LIS->insertMBBInMaps(ULPreheader); + + ULHeader = MF->CreateMachineBasicBlock(); + ULHeader->setAlignment(OrigHeader->getAlignment()); + MF->insert(OrigHeader->getIterator(), ULHeader); + LIS->insertMBBInMaps(ULHeader); + + ULPreheader->addSuccessor(ULHeader); + ULHeader->addSuccessor(ULHeader); + OrigPreheader->replaceSuccessor(OrigHeader, ULPreheader); + + // Create basic blocks for the Remainder Loop. The original loop header + // is used as the remainder loop header. The loop trip count is adjusted + // later to the appropriate value. 
+ RLHeader = OrigHeader; + + ULExit = MF->CreateMachineBasicBlock(); + MF->insert(RLHeader->getIterator(), ULExit); + LIS->insertMBBInMaps(ULExit); + + RLPreheader = MF->CreateMachineBasicBlock(); + MF->insert(RLHeader->getIterator(), RLPreheader); + LIS->insertMBBInMaps(RLPreheader); + + RLExit = MF->CreateMachineBasicBlock(); + MF->insert(++RLHeader->getIterator(), RLExit); + LIS->insertMBBInMaps(RLExit); + + ULExit->addSuccessor(RLPreheader); + RLPreheader->addSuccessor(RLHeader); + + ULHeader->addSuccessor(ULExit); + OrigPreheader->addSuccessor(ULExit); + ULExit->addSuccessor(RLExit); + RLExit->addSuccessor(OrigLoopExit); + RLHeader->replaceSuccessor(OrigLoopExit, RLExit); + + LoopBBs.push_back(ULPreheader); + LoopBBs.push_back(ULHeader); + LoopBBs.push_back(ULExit); + LoopBBs.push_back(RLPreheader); + LoopBBs.push_back(RLHeader); + LoopBBs.push_back(RLExit); + + // Since the instructions are added/deleted to the basic blocks present + // in LoopBBs and OrigPreheader, it makes their slot indexes out-of-date. + // Remove all the instructions currently present in these basic blocks from + // LIS and insert them later after they have gone through all changes. + for (auto MBB : LoopBBs) { + for (MachineInstr &MI : *MBB) + if (!LIS->isNotInMIMap(MI)) + LIS->RemoveMachineInstrFromMaps(MI); + } + + for (MachineInstr &MI : *OrigPreheader) + if (!LIS->isNotInMIMap(MI)) + LIS->RemoveMachineInstrFromMaps(MI); + + // Update the Phis in RLHeader (same as OrigHeader) and + // OrigLoopExit to use the new predecessors. 
+ for (MachineBasicBlock::iterator I = RLHeader->instr_begin(), + E = RLHeader->getFirstNonPHI(); + I != E; ++I) { + MachineInstr *Phi = &*I; + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() != RLHeader) + Phi->getOperand(i + 1).setMBB(RLPreheader); + } + + for (MachineBasicBlock::iterator I = OrigLoopExit->instr_begin(), + E = OrigLoopExit->getFirstNonPHI(); + I != E; ++I) { + MachineInstr *Phi = &*I; + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() == RLHeader) + Phi->getOperand(i + 1).setMBB(RLExit); + } +} + +/// Return the Phi Operand that comes from outside the loop. +static MachineOperand &getInitPhiOp(MachineInstr *Phi, + MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() != LoopBB) + return Phi->getOperand(i); + llvm_unreachable("Unexpected Phi structure."); +} + +/// Return the Phi register value that comes from outside the loop. +static unsigned getInitPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() != LoopBB) + return Phi->getOperand(i).getReg(); + llvm_unreachable("Unexpected Phi structure."); +} + +/// Return the Phi Operand that comes from the loop block. +static MachineOperand &getLoopPhiOp(MachineInstr *Phi, + MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() == LoopBB) + return Phi->getOperand(i); + llvm_unreachable("Unexpected Phi structure."); +} + +/// Return the Phi register value that comes from the loop block. 
+static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() == LoopBB) + return Phi->getOperand(i).getReg(); + llvm_unreachable("Unexpected Phi structure."); +} + +/// Return the basic block corresponding to the Phi register value. +static MachineBasicBlock *getPhiRegBB(MachineInstr *Phi, unsigned Reg) { + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i).getReg() == Reg) + return Phi->getOperand(i + 1).getMBB(); + return 0; +} + +/// Replace all uses of FromReg that appear within the specified +/// basic block with ToReg. +static void replaceRegUses(unsigned FromReg, unsigned ToReg, + MachineBasicBlock *MBB, MachineRegisterInfo &MRI) { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg), + E = MRI.use_end(); + I != E;) { + MachineOperand &O = *I; + ++I; + MachineInstr *UseMI = O.getParent(); + if (UseMI->isPHI() && getPhiRegBB(UseMI, FromReg) != MBB) + continue; // Don't change the register name + + if (UseMI->getParent() == MBB) + O.setReg(ToReg); + } +} + +/// Clone the Phi instruction and set all the operands appropriately. +/// This function assumes the instruction is a Phi. 
+static MachineInstr *clonePHI(MachineBasicBlock *BB, MachineBasicBlock *BB1, + MachineBasicBlock *OrigBB, MachineInstr *Phi) { + MachineFunction *MF = OrigBB->getParent(); + unsigned InitVal = getInitPhiReg(Phi, OrigBB); + unsigned LoopVal = getLoopPhiReg(Phi, OrigBB); + MachineInstr *NewMI = MF->CloneMachineInstr(Phi); + NewMI->getOperand(1).setReg(InitVal); + NewMI->getOperand(2).setMBB(BB1); + NewMI->getOperand(3).setReg(LoopVal); + NewMI->getOperand(4).setMBB(BB); + return NewMI; +} + +static bool isBlockOutsideLoop(SmallVector &LoopBBs, + MachineBasicBlock *MBB) { + for (auto TBB : LoopBBs) + if (TBB == MBB) + return false; + return true; +} + +static void +replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg, + MachineRegisterInfo &MRI, + SmallVector &LoopBBs) { + MachineInstr *DefMI = MRI.getVRegDef(ToReg); + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg), + E = MRI.use_end(); + I != E;) { + MachineOperand &O = *I; + ++I; + MachineBasicBlock *UseBB = O.getParent()->getParent(); + if (isBlockOutsideLoop(LoopBBs, UseBB) && DefMI != O.getParent()) + O.setReg(ToReg); + } +} + +/// Update liveness information for all the basic blocks that are either +/// newly added or modified during the transformation. +static void updateLiveness(SmallVector &MBBList, + LiveIntervals *LIS) { + for (auto MBB : MBBList) { + for (MachineInstr &MI : *MBB) { + if (!LIS->isNotInMIMap(MI)) + LIS->RemoveMachineInstrFromMaps(MI); + if (MI.isDebugValue()) + continue; + LIS->InsertMachineInstrInMaps(MI); + } + } +} + +/// Return the register name for the latest instance of 'reg' as found +/// in the VRMap. FYI, During unrolling, different instances of 'reg' +/// (one from each iteration) are given a new name which is tracked +/// using VRMap. 
+unsigned MachineUnroller::getLatestInstance(unsigned reg, MachineBasicBlock *BB, + ValueMapTy &VRMap) { + unsigned LatestReg = reg; + while (VRMap[BB].count(LatestReg) && LatestReg != VRMap[BB][LatestReg]) { + LatestReg = VRMap[BB][LatestReg]; + } + return LatestReg; +} + +/// Update the machine instruction with new virtual registers. This +/// function is only used to update the instructions in the unrolled +/// loop header. It may change the definitions and/or uses. +void MachineUnroller::updateInstruction(MachineInstr *NewMI, bool FirstIter, + ValueMapTy &OldVRMap) { + MachineBasicBlock *BB = NewMI->getParent(); + DenseMap NewVRMap; + DenseMap &BBVRMap = VRMap[BB]; + for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = NewMI->getOperand(i); + if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + unsigned reg = MO.getReg(); + if (MO.isDef()) { + // Create a new virtual register for the definition. + const TargetRegisterClass *RC = MRI->getRegClass(reg); + unsigned NewReg = MRI->createVirtualRegister(RC); + MO.setReg(NewReg); + NewVRMap[reg] = NewReg; + if (NewMI->isPHI()) + ULPhiVRMap[reg] = NewReg; + } else if (MO.isUse()) { + MachineInstr *DefMI = MRI->getVRegDef(reg); + if (DefMI && DefMI->isPHI()) { + if (NewMI->isPHI() && FirstIter) + // Don't change the 'use' yet based on the new def reg. It will be + // changed later to use the last instance of the value reaching + // from the loop after it has been unrolled. + continue; + else if (!FirstIter) { + // Get mapped reg: + // 1) If 'use' is a PHI, use the mapped reg from the previous + // iteration. + // 2) If 'use' is a non-PHI, use the mapped reg from the current + // iteration. + unsigned LatestReg = NewMI->isPHI() + ? 
getLatestInstance(reg, BB, OldVRMap) + : getLatestInstance(reg, BB, VRMap); + MO.setReg(LatestReg); + continue; + } + } + if (BBVRMap.count(reg)) { + unsigned MappedReg = BBVRMap[reg]; + if (MRI->getVRegDef(MappedReg) != NewMI) + MO.setReg(MappedReg); + } + } + } + + for (auto Val : NewVRMap) + VRMap[BB][Val.first] = Val.second; +} + +/// Return true if we can compute the amount the instruction changes +/// during each iteration. Set Delta to the amount of the change. +bool MachineUnroller::computeDelta(MachineInstr &MI, unsigned &Delta) const { + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + unsigned BaseReg; + int64_t Offset; + if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) + return false; + + // Check if there is a Phi. If so, get the definition in the loop. + MachineInstr *BaseDef = MRI->getVRegDef(BaseReg); + if (BaseDef && BaseDef->isPHI()) { + if (BaseDef->getParent() != MI.getParent()) + return false; + BaseReg = getLoopPhiReg(BaseDef, MI.getParent()); + BaseDef = MRI->getVRegDef(BaseReg); + } + if (!BaseDef) + return false; + + int D = 0; + if (!TII->getIncrementValue(*BaseDef, D) && D >= 0) + return false; + + Delta = D; + return true; +} + +/// Update the memory operand with a new offset when the unroller +/// generates a new copy of the instruction that refers to a +/// different memory location. +void MachineUnroller::updateMemOperands(MachineInstr *NewMI, + MachineInstr *OldMI, unsigned iter) + const { + if (iter == 0) + return; + // If the instruction has memory operands, then adjust the offset + // when the instruction appears in different iterations. 
+ if (NewMI->memoperands_empty()) + return; + SmallVector NewMMOs; + for (MachineMemOperand *MMO : NewMI->memoperands()) { + if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) || + (!MMO->getValue())) { + NewMMOs.push_back(MMO); + continue; + } + unsigned Delta; + if (computeDelta(*OldMI, Delta)) { + int64_t AdjOffset = Delta * iter; + NewMMOs.push_back( + MF->getMachineMemOperand(MMO, AdjOffset, MMO->getSize())); + } else + NewMMOs.push_back( + MF->getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize)); + } + NewMI->setMemRefs(*MF, NewMMOs); +} + +/// Adjust offset value for the instructions with memory operands when their +/// copies are generated after first iteration. By adjusting the offset and +/// using the right base register, we can avoid unnecessary 'add' instructions +/// that are used to increment the offset for each iteration. + +/// Generate instructions for the unrolled loop header. +void MachineUnroller::generateUnrolledLoop() { + for (unsigned iter = 0; iter < UnrollFactor; iter++) { + ValueMapTy OldVRMap = VRMap; + for (MachineBasicBlock::iterator I = OrigHeader->instr_begin(), + E = OrigHeader->getFirstTerminator(); + I != E; ++I) { + MachineInstr *MI = &*I; + bool FirstIter = (iter == 0); + if (MI->isPHI() && !FirstIter) { + // Just create a new dummy register name for the PHI def and map + // it to LoopVal reaching from the previous iteration. + unsigned OrigReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(OrigReg); + unsigned NewReg = MRI->createVirtualRegister(RC); + VRMap[ULHeader][OrigReg] = NewReg; + unsigned LoopVal = getLoopPhiReg(MI, OrigHeader); + VRMap[ULHeader][NewReg] = + getLatestInstance(LoopVal, ULHeader, OldVRMap); + continue; + } + MachineInstr *NewMI = + MI->isPHI() ? 
clonePHI(ULHeader, ULPreheader, OrigHeader, MI) + : MF->CloneMachineInstr(MI); + ULHeader->push_back(NewMI); + updateInstruction(NewMI, iter == 0, OldVRMap); + updateMemOperands(NewMI, MI, iter); + } + } + + // Copy any terminator instructions to the unrolled loop header. + for (MachineBasicBlock::iterator I = OrigHeader->getFirstTerminator(), + E = OrigHeader->instr_end(); + I != E; ++I) { + MachineInstr *NewMI = MF->CloneMachineInstr(&*I); + ULHeader->push_back(NewMI); + updateInstruction(NewMI, false, VRMap); + } + + // Update PHIs + for (MachineBasicBlock::iterator I = ULHeader->instr_begin(), + E = ULHeader->getFirstNonPHI(); + I != E; ++I) { + MachineInstr *Phi = &*I; + MachineOperand &MO = getLoopPhiOp(Phi, ULHeader); + unsigned reg = MO.getReg(); + MO.setReg(getLatestInstance(reg, ULHeader, VRMap)); + } +} + +/// Regenerate post-increment load/store instructions. Also, update the offset +/// value for the load/store instructions that use the same base address as the +/// newly created post-increment load/store. + +/// Generate Phis for the exit block for the unrolled loop. +void MachineUnroller::generatePhisForULExit() { + ValueMapTy OldVRMap = VRMap; + for (MachineBasicBlock::iterator I = OrigHeader->instr_begin(), + E = OrigHeader->getFirstNonPHI(); + I != E; ++I) { + MachineInstr *Phi = &*I; + assert(Phi->isPHI() && "Expecting a Phi."); + unsigned DefReg = Phi->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(DefReg); + unsigned InitVal = getInitPhiReg(Phi, OrigHeader); + unsigned LoopVal = getLoopPhiReg(Phi, OrigHeader); + + assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure."); + MachineInstr *LoopInst = MRI->getVRegDef(LoopVal); + unsigned PhiOp1 = InitVal; + unsigned PhiOp2 = LoopInst->isPHI() + ? 
getLatestInstance(LoopVal, ULHeader, OldVRMap) + : getLatestInstance(LoopVal, ULHeader, VRMap); + + unsigned NewReg = MRI->createVirtualRegister(RC); + MachineInstrBuilder NewPhi = + BuildMI(*ULExit, ULExit->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + NewPhi.addReg(PhiOp1).addMBB(OrigPreheader); + NewPhi.addReg(PhiOp2).addMBB(ULHeader); + VRMap[ULExit][DefReg] = NewReg; + replaceRegUses(DefReg, NewReg, ULExit, *MRI); + + // Update Phi in the original loop header to use 'NewReg' + // as the initial value. + getInitPhiOp(Phi, OrigHeader).setReg(NewReg); + } + + // Generate additional PHIs for the values that are live-in for + // the original loop exit block. + generateNewPhis(ULExit, OrigPreheader, ULHeader); +} + +unsigned MachineUnroller::getMappedRegORCreate(unsigned Reg, + MachineBasicBlock *BB) { + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + if (VRMap[BB].count(Reg)) + return getLatestInstance(Reg, BB, VRMap); + + unsigned NewReg = MRI->createVirtualRegister(RC); + BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), NewReg); + return NewReg; +} + +void MachineUnroller::generateNewPhis(MachineBasicBlock *BB, + MachineBasicBlock *BB1, + MachineBasicBlock *BB2) { + for (auto Reg : ExitBBLiveIns) { + unsigned BB1Reg = getMappedRegORCreate(Reg, BB1); + unsigned BB2Reg = getMappedRegORCreate(Reg, BB2); + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + unsigned NewReg = MRI->createVirtualRegister(RC); + MachineInstrBuilder NewPhi = BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + NewPhi.addReg(BB1Reg).addMBB(BB1); + NewPhi.addReg(BB2Reg).addMBB(BB2); + VRMap[BB][Reg] = NewReg; + } +} + +/// Generate Phis for the exit block for the remainder loop. +void MachineUnroller::generatePhisForRLExit() { + // Generate PHIs for the values that are live-in for + // the original loop exit block. 
+ generateNewPhis(RLExit, ULExit, RLHeader); + + for (MachineBasicBlock::iterator I = RLExit->instr_begin(), + E = RLExit->getFirstNonPHI(); + I != E; ++I) { + MachineInstr *Phi = &*I; + unsigned OrigBBReg = 0; + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) { + if (Phi->getOperand(i + 1).getMBB() == OrigHeader) + OrigBBReg = Phi->getOperand(i).getReg(); + } + assert(OrigBBReg != 0 && "Unexpected Phi structure."); + unsigned PhiDefReg = Phi->getOperand(0).getReg(); + replaceRegUsesAfterLoop(OrigBBReg, PhiDefReg, *MRI, LoopBBs); + } +} + +void MachineUnroller::getExitBBLiveIns() { + for (auto I = OrigHeader->instr_begin(), E = OrigHeader->instr_end(); I != E; + ++I) { + MachineInstr *MI = &*I; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef() || + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + unsigned DefReg = MO.getReg(); + for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DefReg), + E = MRI->use_end(); + I != E;) { + MachineOperand &O = *I; + ++I; + if (O.getParent()->getParent() != OrigHeader) { + ExitBBLiveIns.push_back(DefReg); + break; + } + } + } + } +} + +void MachineUnroller::addBBIntoVRMap(MachineBasicBlock *BB) { + for (auto I = BB->instr_begin(), E = BB->instr_end(); I != E; ++I) { + MachineInstr *MI = &*I; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + if (MO.isDef()) { + unsigned DefReg = MO.getReg(); + VRMap[BB][DefReg] = DefReg; + } + } + } +} + +/// Remove all Phi instructions from BB. 
+static void cleanUpPHIs(MachineBasicBlock *BB, MachineRegisterInfo &MRI) { + for (MachineBasicBlock::iterator MII = BB->instr_begin(), + MIE = BB->getFirstNonPHI(); + MII != MIE;) { + MachineInstr *Phi = &*MII; + ++MII; + unsigned InitVal = getInitPhiReg(Phi, BB); + unsigned PhiDef = Phi->getOperand(0).getReg(); + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(PhiDef), + E = MRI.use_end(); + I != E;) { + MachineOperand &O = *I; + ++I; + O.setReg(InitVal); + } + Phi->eraseFromParent(); + } +} + +/// Fix all the branches for the unrolled and remainder loops. Also, update +/// the loop count. +void MachineUnroller::fixBranchesAndLoopCount(unsigned ULCount, + unsigned RLCount) { + SmallVector Cond; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + bool checkBranch = TII->analyzeBranch(*ULHeader, TBB, FBB, Cond); + assert(!checkBranch && "Can't analyze the branch in UnrolledLoop Header"); + (void)checkBranch; + + TII->removeBranch(*ULHeader); + TII->insertBranch(*ULHeader, ULHeader, ULExit, Cond, DebugLoc()); + + // Change loop count for the Unrolled loop and fixup branches. + SmallVector Cond1; + changeLoopCount(*OrigPreheader, *ULPreheader, *ULHeader, ULCount, LoopIndVar, + *LoopCmp, Cond1); + TII->insertBranch(*OrigPreheader, ULExit, ULPreheader, Cond1, DebugLoc()); + Cond1.clear(); + TII->insertBranch(*ULPreheader, ULHeader, nullptr, Cond1, DebugLoc()); + + // Copy instructions from the unrolled loop preheader as it may contain + // loop setup instructions also needed for the Remainder loop. + for (MachineBasicBlock::iterator I = ULPreheader->instr_begin(), + E = ULPreheader->getFirstTerminator(); + I != E; ++I) { + MachineInstr *MI = &*I; + MachineInstr *NewMI = MF->CloneMachineInstr(MI); + ULExit->push_back(NewMI); + } + + // Change loop count for the Remainder loop and fixup branches. 
+ TII->removeBranch(*RLHeader); + TII->insertBranch(*RLHeader, RLHeader, RLExit, Cond, DebugLoc()); + + Cond1.clear(); + changeLoopCount(*ULExit, *RLPreheader, *RLHeader, RLCount, LoopIndVar, + *LoopCmp, Cond1); + TII->insertBranch(*ULExit, RLExit, RLPreheader, Cond1, DebugLoc()); + + Cond1.clear(); + TII->insertBranch(*RLPreheader, RLHeader, nullptr, Cond1, DebugLoc()); + TII->insertBranch(*RLExit, OrigLoopExit, nullptr, Cond1, DebugLoc()); + if (RLHeader->succ_size() == 1) + cleanUpPHIs(RLHeader, *MRI); +} + +void MachineUnroller::preprocessPhiNodes(MachineBasicBlock &B) { + SlotIndexes &Slots = *LIS->getSlotIndexes(); + + for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) { + MachineOperand &DefOp = PI.getOperand(0); + assert(DefOp.getSubReg() == 0); + auto *RC = MRI->getRegClass(DefOp.getReg()); + + for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) { + MachineOperand &RegOp = PI.getOperand(i); + if (RegOp.getSubReg() == 0) + continue; + + // If the operand uses a subregister, replace it with a new register + // without subregisters, and generate a copy to the new register. + unsigned NewReg = MRI->createVirtualRegister(RC); + MachineBasicBlock &PredB = *PI.getOperand(i + 1).getMBB(); + MachineBasicBlock::iterator At = PredB.getFirstTerminator(); + const DebugLoc &DL = PredB.findDebugLoc(At); + auto Copy = + BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg) + .addReg(RegOp.getReg(), getRegState(RegOp), RegOp.getSubReg()); + Slots.insertMachineInstrInMaps(*Copy); + RegOp.setReg(NewReg); + RegOp.setSubReg(0); + } + } +} + +bool MachineUnroller::unroll(MachineLoop *loop, unsigned unrollFactor) { + init(loop, unrollFactor); + if (!canUnroll()) + return false; + + // Remove any subregisters from input to phi nodes. + preprocessPhiNodes(*loop->getHeader()); + + // Add all the def regs in the loop header in VRMap. 
+ addBBIntoVRMap(OrigHeader); + getExitBBLiveIns(); + + // Create empty basic blocks for the unrolled version of the loop. + createUnrolledLoopStruct(); + + // Add instructions to compute trip counts for the unrolled and + // remainder loops. + TII->removeBranch(*OrigPreheader); + unsigned ULCount = addUnrolledLoopCountMI(*OrigPreheader, LC, UnrollFactor); + unsigned RLCount = addRemLoopCountMI(*OrigPreheader, LC, UnrollFactor); + + // Add instructions to the Unrolled loop header. + generateUnrolledLoop(); + + // Generate Phis for the unrolled loop exit block and also update + // Phis in the remainder loop header to use the correct initial values. + generatePhisForULExit(); + + // Generate Phis for the remainder loop exit block. + generatePhisForRLExit(); + + // Optimize unrolled loop header. + optimize(*ULHeader); + + // Update branches and adjust loop count. + fixBranchesAndLoopCount(ULCount, RLCount); + + SmallVector UpdateBBs = LoopBBs; + UpdateBBs.insert(UpdateBBs.begin(), OrigPreheader); + updateLiveness(UpdateBBs, LIS); + + // Modify existing loop to point to the unrolled loop header. 
+ L->removeBlockFromLoop(OrigHeader); + L->addBasicBlockToLoop(ULHeader, MLI->getBase()); + return true; +} Index: lib/Target/Hexagon/CMakeLists.txt =================================================================== --- lib/Target/Hexagon/CMakeLists.txt +++ lib/Target/Hexagon/CMakeLists.txt @@ -43,6 +43,7 @@ HexagonLoopIdiomRecognition.cpp HexagonMachineFunctionInfo.cpp HexagonMachineScheduler.cpp + HexagonMachineUnroller.cpp HexagonMCInstLower.cpp HexagonNewValueJump.cpp HexagonOptAddrMode.cpp Index: lib/Target/Hexagon/Hexagon.td =================================================================== --- lib/Target/Hexagon/Hexagon.td +++ lib/Target/Hexagon/Hexagon.td @@ -252,7 +252,7 @@ let ValueCols = [["BaseLongOffset"]]; } -def changeAddrMode_ur_rr : InstrMapping { +def changeAddrMode_ur_rr: InstrMapping { let FilterClass = "ImmRegShl"; let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"]; let ColFields = ["addrMode"]; Index: lib/Target/Hexagon/HexagonDepInstrInfo.td =================================================================== --- lib/Target/Hexagon/HexagonDepInstrInfo.td +++ lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -10026,6 +10026,7 @@ let addrMode = PostInc; let accessSize = ByteAccess; let mayLoad = 1; +let CextOpcode = "L2_loadrb"; let BaseOpcode = "L2_loadrb_pi"; let Constraints = "$Rx32 = $Rx32in"; } @@ -10394,6 +10395,7 @@ let addrMode = PostInc; let accessSize = HalfWordAccess; let mayLoad = 1; +let CextOpcode = "L2_loadrh"; let BaseOpcode = "L2_loadrh_pi"; let Constraints = "$Rx32 = $Rx32in"; } @@ -10590,6 +10592,7 @@ let addrMode = PostInc; let accessSize = WordAccess; let mayLoad = 1; +let CextOpcode = "L2_loadri"; let BaseOpcode = "L2_loadri_pi"; let Constraints = "$Rx32 = $Rx32in"; } @@ -10786,6 +10789,7 @@ let addrMode = PostInc; let accessSize = ByteAccess; let mayLoad = 1; +let CextOpcode = "L2_loadrub"; let BaseOpcode = "L2_loadrub_pi"; let Constraints = "$Rx32 = $Rx32in"; } @@ -10982,6 +10986,7 @@ let addrMode = 
PostInc; let accessSize = HalfWordAccess; let mayLoad = 1; +let CextOpcode = "L2_loadruh"; let BaseOpcode = "L2_loadruh_pi"; let Constraints = "$Rx32 = $Rx32in"; } @@ -20655,6 +20660,7 @@ let isNewValue = 1; let isRestrictNoSlot1Store = 1; let mayStore = 1; +let CextOpcode = "S2_storerb"; let BaseOpcode = "S2_storerb_pi"; let isPredicable = 1; let isNVStorable = 1; @@ -21151,6 +21157,7 @@ let isNewValue = 1; let isRestrictNoSlot1Store = 1; let mayStore = 1; +let CextOpcode = "S2_storerh"; let BaseOpcode = "S2_storerh_pi"; let isNVStorable = 1; let isPredicable = 1; @@ -21423,6 +21430,7 @@ let isNewValue = 1; let isRestrictNoSlot1Store = 1; let mayStore = 1; +let CextOpcode = "S2_storeri"; let BaseOpcode = "S2_storeri_pi"; let isPredicable = 1; let opNewValue = 3; Index: lib/Target/Hexagon/HexagonMachineUnroller.h =================================================================== --- /dev/null +++ lib/Target/Hexagon/HexagonMachineUnroller.h @@ -0,0 +1,63 @@ +//===------ HexagonMachineUnroller.h - Custom Hexagon Machine Unroller-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom Hexagon Machine Unroller
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEUNROLLER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEUNROLLER_H
+
+#include "HexagonInstrInfo.h"
+#include "llvm/CodeGen/MachineUnroller.h"
+
+namespace llvm {
+
+class HexagonMachineUnroller : public MachineUnroller {
+  const HexagonInstrInfo *HII;
+
+public:
+  HexagonMachineUnroller(MachineUnrollerContext *C) : MachineUnroller(C) {
+    HII = static_cast(C->TII);
+  }
+
+  /// Return the trip count of the loop ending at Cmp (hardware loops only).
+  unsigned getLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar,
+                        MachineInstr &Cmp) const override;
+
+  /// Add instruction to compute trip count for the unrolled loop.
+  unsigned addUnrolledLoopCountMI(MachineBasicBlock &MBB, unsigned LC,
+                                  unsigned UnrollFactor) const override;
+
+  /// Add instruction to compute remainder trip count for the unrolled loop.
+  unsigned addRemLoopCountMI(MachineBasicBlock &MBB, unsigned LC,
+                             unsigned UnrollFactor) const override;
+
+  /// Rewrite the hardware-loop setup/compare to use the new trip count LC.
+  void changeLoopCount(MachineBasicBlock &BB, MachineBasicBlock &Preheader,
+                       MachineBasicBlock &Header, unsigned LC,
+                       MachineInstr *IndVar, MachineInstr &Cmp,
+                       SmallVectorImpl &Cond) const override;
+
+  /// Target hook: optimize the unrolled loop body (post-increment
+  /// formation, add-folding, dead-code removal).
+  void optimize(MachineBasicBlock &BB) const override;
+
+  bool canReplaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
+  void replaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
+  void generatePostInc(MachineBasicBlock *BB) const;
+  void replacePostIncWithBaseOffset(MachineBasicBlock *BB) const;
+  void replacePostIncWithBaseOffset(MachineInstr *MI) const;
+  bool isValidPostIncValue(const MachineInstr &MI, int IncVal) const;
+  void updateBaseAndOffset(MachineInstr *MI, MachineInstr *AddMI) const;
+  void foldAdds(MachineBasicBlock &BB) const;
+  // Remove dead instructions that might have been added during unrolling.
+ void removeDeadInstructions(MachineBasicBlock &BB) const; + bool isValidOffset(const MachineInstr &MI, int64_t Offset, + const TargetRegisterInfo *TRI) const; +}; +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEUNROLLER_H Index: lib/Target/Hexagon/HexagonMachineUnroller.cpp =================================================================== --- /dev/null +++ lib/Target/Hexagon/HexagonMachineUnroller.cpp @@ -0,0 +1,471 @@ +//===----- HexagonMachineUnroller.cpp - Custom Hexagon Machine Unroller ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Custom Hexagon Machine Unroller +// +//===----------------------------------------------------------------------===// + +#include "HexagonMachineUnroller.h" +#include "HexagonInstrInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineUnroller.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +static bool isAddWithImmValue(const MachineInstr &MI) { + return MI.getOpcode() == Hexagon::A2_addi; +} + +/// Return true if MIA dominates MIB. +static bool dominates(MachineInstr *MIA, MachineInstr *MIB) { + if (MIA->getParent() != MIB->getParent()) + return false; // Don't know since machine dominator tree is out of date. + + MachineBasicBlock *MBB = MIA->getParent(); + MachineBasicBlock::iterator I = MBB->instr_begin(); + // Iterate over the basic block until MIA or MIB is found. + for (; &*I != MIA && &*I != MIB; ++I) + ; + + // MIA dominates MIB if MIA is found first. + return &*I == MIA; +} + +/// Return the Phi register value that comes from the loop block. 
+static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() == LoopBB) + return Phi->getOperand(i).getReg(); + llvm_unreachable("Unexpected Phi structure."); +} + +static bool executesAtMostOnce(MachineInstr *MI) { + if (MI->getOpcode() != Hexagon::A2_andir) + return false; + if (MI->getOperand(2).getImm() == 1) + return true; + return false; +} + +unsigned HexagonMachineUnroller::getLoopCount(MachineBasicBlock &MBB, + MachineInstr *IndVar, + MachineInstr &Cmp) const { + // We expect a hardware loop currently. This means that IndVar is set + // to null, and the compare is the ENDLOOP instruction. + assert((!IndVar) && HII->isEndLoopN(Cmp.getOpcode()) && + "Expecting a hardware loop"); + DebugLoc DL = Cmp.getDebugLoc(); + SmallPtrSet VisitedBBs; + MachineInstr *Loop = HII->findLoopInstr( + &MBB, Cmp.getOpcode(), Cmp.getOperand(0).getMBB(), VisitedBBs); + if (!Loop) + return 0; + // The loop trip count is a compile-time value. + if (Loop->getOpcode() == Hexagon::J2_loop0i || + Loop->getOpcode() == Hexagon::J2_loop1i) + return Loop->getOperand(1).getImm(); + + // The loop trip count is a run-time value. 
+ assert(Loop->getOpcode() == Hexagon::J2_loop0r && "Unexpected instruction"); + return Loop->getOperand(1).getReg(); +} + +unsigned HexagonMachineUnroller::addUnrolledLoopCountMI( + MachineBasicBlock &MBB, unsigned LC, unsigned UnrollFactor) const { + assert(isPowerOf2_32(UnrollFactor) && "UnrollFactor must be a power of 2"); + MachineFunction *MF = MBB.getParent(); + unsigned ShiftBy = Log2_32(UnrollFactor); + unsigned NewUnrolledLC = HII->createVR(MF, MVT::i32); + BuildMI(MBB, MBB.instr_end(), DebugLoc(), HII->get(Hexagon::S2_lsr_i_r), + NewUnrolledLC) + .addReg(LC) + .addImm(ShiftBy); + return NewUnrolledLC; +} + +unsigned +HexagonMachineUnroller::addRemLoopCountMI(MachineBasicBlock &MBB, unsigned LC, + unsigned UnrollFactor) const { + assert(isPowerOf2_32(UnrollFactor) && "UnrollFactor must be a power of 2"); + MachineFunction *MF = MBB.getParent(); + unsigned RemLC = HII->createVR(MF, MVT::i32); + BuildMI(MBB, MBB.instr_end(), DebugLoc(), HII->get(Hexagon::A2_andir), RemLC) + .addReg(LC) + .addImm(UnrollFactor - 1); + return RemLC; +} + +/// For instructions with a base and offset, return true if the new Offset +/// is a valid value with the correct alignment. +bool HexagonMachineUnroller::isValidOffset( + const MachineInstr &MI, int64_t Offset, + const TargetRegisterInfo *TRI) const { + if (!HII->isValidOffset(MI.getOpcode(), Offset, TRI, false)) + return false; + unsigned AlignMask = HII->getMemAccessSize(MI) - 1; + return (Offset & AlignMask) == 0; +} + +void HexagonMachineUnroller::changeLoopCount( + MachineBasicBlock &BB, MachineBasicBlock &Preheader, + MachineBasicBlock &Header, unsigned LC, MachineInstr *IndVar, + MachineInstr &Cmp, SmallVectorImpl &Cond) const { + + // We expect a hardware loop currently. This means that IndVar is set + // to null, and the compare is the ENDLOOP instruction. 
+ assert((!IndVar) && HII->isEndLoopN(Cmp.getOpcode()) && + "Expecting a hardware loop"); + MachineFunction *MF = Preheader.getParent(); + DebugLoc DL = Cmp.getDebugLoc(); + SmallPtrSet VisitedBBs; + MachineInstr *Loop = HII->findLoopInstr( + &Header, Cmp.getOpcode(), Cmp.getOperand(0).getMBB(), VisitedBBs); + if (!Loop) + return; + // The loop trip count is a run-time value. + assert(Loop->getOpcode() == Hexagon::J2_loop0r && "Unexpected instruction"); + MachineRegisterInfo &MRI = Cmp.getParent()->getParent()->getRegInfo(); + MachineInstr *LCDefMI = MRI.getVRegDef(LC); + MachineInstr *NewCmp; + if (executesAtMostOnce(LCDefMI)) { + // The loop executes at most once. Therefore, it must be unrolled + // by removing loop setup, endloop and back-edge (jump) instruction to avoid + // stalls due to front-end mispredictions. + // FYI: the front end predicts endloop is taken twice and then waits to see + // which way it goes when it encounters it a third time. Since loop[01] is + // resolved by the back-end and it takes at least 10 cycles from fetch to + // commit, for the very small loops that execute only once, it can result + // into a lot of stalled cycles. + unsigned LoopEnd = HII->createVR(MF, MVT::i1); + NewCmp = BuildMI(&BB, DL, HII->get(Hexagon::C2_cmpgtui), LoopEnd) + .addReg(LC) + .addImm(0); + Cmp.eraseFromParent(); + Header.removeSuccessor(&Header); + } else { + unsigned LoopEnd = HII->createVR(MF, MVT::i1); + NewCmp = BuildMI(&BB, DL, HII->get(Hexagon::C2_cmpgtui), LoopEnd) + .addReg(LC) + .addImm(0); + BuildMI(&Preheader, DL, HII->get(Hexagon::J2_loop0r)) + .addMBB(Loop->getOperand(0).getMBB()) + .addReg(LC); + } + // Delete the old loop instruction. 
+ Loop->eraseFromParent(); + Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf)); + Cond.push_back(NewCmp->getOperand(0)); +} + +bool HexagonMachineUnroller::isValidPostIncValue(const MachineInstr &MI, + int IncVal) const { + unsigned AlignMask = HII->getMemAccessSize(MI) - 1; + if ((IncVal & AlignMask) != 0) + return false; + // Number of total bits in the instruction used to encode Inc value. + unsigned IncBits = 4; + IncBits += Log2_32(HII->getMemAccessSize(MI)); + int MinValidVal = -1U << (IncBits - 1); + int MaxValidVal = ~(-1U << (IncBits - 1)); + return (IncVal >= MinValidVal && IncVal <= MaxValidVal); +} + +void HexagonMachineUnroller::foldAdds(MachineBasicBlock &BB) const { + for (MachineBasicBlock::iterator I = BB.getFirstNonPHI(), + E = BB.getFirstTerminator(); + I != E;) { + MachineInstr *MI = &*I; + I++; + if (!isAddWithImmValue(*MI)) + continue; + unsigned DefReg = MI->getOperand(0).getReg(); + unsigned AddReg = MI->getOperand(1).getReg(); + int64_t AddImm = MI->getOperand(2).getImm(); + + SmallVector UseList; + for (MachineRegisterInfo::use_iterator RI = MRI->use_begin(DefReg), + RE = MRI->use_end(); + RI != RE; ++RI) { + MachineOperand &MO = *RI; + MachineInstr *UseMI = MO.getParent(); + UseList.push_back(UseMI); + } + for (auto UseMI : UseList) { + if (isAddWithImmValue(*UseMI)) { + int64_t NewImm = AddImm + UseMI->getOperand(2).getImm(); + UseMI->getOperand(1).setReg(AddReg); + UseMI->getOperand(2).setImm(NewImm); + } else if (HII->isBaseImmOffset(*UseMI)) + updateBaseAndOffset(UseMI, MI); + } + } + removeDeadInstructions(BB); +} + +void HexagonMachineUnroller::updateBaseAndOffset(MachineInstr *MI, + MachineInstr *AddMI) const { + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + assert(HII->isBaseImmOffset(*MI)); + unsigned BasePos, OffsetPos; + if (!HII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) + return; + + MachineOperand &OffsetOp = MI->getOperand(OffsetPos); + MachineOperand &BaseOp = 
MI->getOperand(BasePos); + + if (BaseOp.getReg() != AddMI->getOperand(0).getReg()) + return; + + unsigned IncBase = AddMI->getOperand(1).getReg(); + int64_t IncValue = AddMI->getOperand(2).getImm(); + + int64_t NewOffset = OffsetOp.getImm() + IncValue; + if (!isValidOffset(*MI, NewOffset, TRI)) + return; + + OffsetOp.setImm(NewOffset); + BaseOp.setReg(IncBase); +} + +void HexagonMachineUnroller::replacePostIncWithBaseOffset( + MachineBasicBlock *BB) const { + for (MachineBasicBlock::iterator I = BB->getFirstNonPHI(), + E = BB->getFirstTerminator(); + I != E;) { + MachineInstr *MI = &*I; + I++; + if (!HII->isPostIncrement(*MI)) + continue; + + replacePostIncWithBaseOffset(MI); + } +} + +void HexagonMachineUnroller::replacePostIncWithBaseOffset( + MachineInstr *MI) const { + if (!HII->isPostIncrement(*MI) || HII->isPredicated(*MI)) + return; + short NewOpcode = HII->changeAddrMode_pi_io(MI->getOpcode()); + if (NewOpcode < 0) + return; + + unsigned BasePos = 0, OffsetPos = 0; + if (!HII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) + return; + const MachineOperand &IncValue = MI->getOperand(OffsetPos); + const MachineOperand &IncBase = MI->getOperand(BasePos); + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + MachineOperand *IncDest; + MachineInstrBuilder MIB; + if (MI->mayLoad()) { + IncDest = &MI->getOperand(1); + const MachineOperand &LDValue = MI->getOperand(0); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + MIB.add(LDValue).add(IncBase).addImm(0); + } else { + IncDest = &MI->getOperand(0); + const MachineOperand &STValue = MI->getOperand(3); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + MIB.add(IncBase).addImm(0).add(STValue); + } + + // Transfer memoperands. 
+ MIB->setMemRefs(*MBB.getParent(), MI->memoperands()); + + MachineInstrBuilder MIBA = BuildMI(MBB, *MI, DL, HII->get(Hexagon::A2_addi)); + MIBA.add(*IncDest).add(IncBase).add(IncValue); + MI->eraseFromParent(); +} + + +// Convert post-inc addressing mode into base-offset along with an +// 'add' instruction that is used to increment the address. +// This is done to break dependence between post-increment memory operations +// in the unrolled version of the loop. 'add' instructions are later +// optimized out. +// Ex: +// original loop: +// v1 = phi(v0, v3) +// v2,v3 = post_load v1, 4 + +// Unrolling without optimizing post-increments: +// v1 = phi(v0, v3') +// v2,v3 = post_load v1, 4 +// v2',v3'= post_load v3, 4 + +// Instead, we want to have this: +// v1 = phi(v0, v3') +// v2,v3' = post_load v1, 8 +// v2 = load v3', -4 +// +void HexagonMachineUnroller::generatePostInc(MachineBasicBlock *BB) const { + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + MachineBasicBlock::iterator MII = BB->getFirstNonPHI(); + MachineBasicBlock::iterator MIE = BB->instr_begin(); + bool isOK = true; + while (MII != MIE) { + MachineInstr *Phi = &*std::prev(MII); + MII = std::prev(MII); + unsigned LoopVal = getLoopPhiReg(Phi, BB); + MachineInstr *LoopInst = MRI->getVRegDef(LoopVal); + if (!isAddWithImmValue(*LoopInst)) + continue; + + if (LoopInst->getOpcode() != Hexagon::A2_addi) + continue; + + unsigned AddReg = LoopInst->getOperand(1).getReg(); + int64_t AddImm = LoopInst->getOperand(2).getImm(); + SmallVector UseList; + MachineInstr *PostIncCandidate = nullptr; + + for (MachineRegisterInfo::use_iterator RI = MRI->use_begin(AddReg), + RE = MRI->use_end(); + RI != RE; ++RI) { + MachineOperand &MO = *RI; + MachineInstr *UseMI = MO.getParent(); + if (UseMI == LoopInst) + continue; + if (!dominates(UseMI, LoopInst)) { + isOK = false; + break; + } + unsigned BaseReg; + int64_t Offset; + if (!HII->isBaseImmOffset(*UseMI) || + !HII->getMemOpBaseRegImmOfs(*UseMI, 
BaseReg, Offset, TRI)) { + isOK = false; + break; + } + int64_t NewOffset = Offset - AddImm; + if (!isValidOffset(*UseMI, NewOffset, TRI) || BaseReg != AddReg) { + isOK = false; + break; + } + if (Offset == 0 && !PostIncCandidate) { + PostIncCandidate = UseMI; + continue; + } + UseList.push_back(UseMI); + } + + if (!isOK) + continue; + + // If a candidate is found, replace it with the post-inc instruction. + // Also, adjust offset for other uses as needed. + if (!PostIncCandidate || !canReplaceWithPostInc(PostIncCandidate, LoopInst)) + continue; + + for (auto UseMI : UseList) { + if (!dominates(PostIncCandidate, UseMI)) + continue; + unsigned BasePos, OffsetPos; + if (HII->getBaseAndOffsetPosition(*UseMI, BasePos, OffsetPos)) { + // New offset has already been validated; no need to do it again. + int64_t NewOffset = UseMI->getOperand(OffsetPos).getImm() - AddImm; + UseMI->getOperand(OffsetPos).setImm(NewOffset); + UseMI->getOperand(BasePos).setReg(LoopVal); + } + } + replaceWithPostInc(PostIncCandidate, LoopInst); + } +} + +bool HexagonMachineUnroller::canReplaceWithPostInc(MachineInstr *MI, + MachineInstr *AddMI) const { + if (HII->changeAddrMode_io_pi(MI->getOpcode()) < 0) + return false; + assert(AddMI->getOpcode() == Hexagon::A2_addi); + return isValidPostIncValue(*MI, AddMI->getOperand(2).getImm()); +} + +void HexagonMachineUnroller::replaceWithPostInc(MachineInstr *MI, + MachineInstr *AddMI) const { + short NewOpcode = HII->changeAddrMode_io_pi(MI->getOpcode()); + assert(NewOpcode >= 0 && + "Couldn't change base offset to post-increment form"); + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + const MachineOperand &IncDest = AddMI->getOperand(0); + const MachineOperand &IncBase = AddMI->getOperand(1); + const MachineOperand &IncValue = AddMI->getOperand(2); + MachineInstrBuilder MIB; + if (MI->mayLoad()) { + const MachineOperand &LDValue = MI->getOperand(0); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + 
MIB.add(LDValue).add(IncDest).add(IncBase).add(IncValue); + } else { + const MachineOperand &STValue = MI->getOperand(2); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + MIB.add(IncDest).add(IncBase).add(IncValue).add(STValue); + } + + // Transfer memoperands. + MIB->setMemRefs(*MBB.getParent(), MI->memoperands()); + + MI->eraseFromParent(); + AddMI->eraseFromParent(); +} + +/// Remove instructions that generate values with no uses. +void HexagonMachineUnroller::removeDeadInstructions( + MachineBasicBlock &BB) const { + // For BB, check that the value defined by each instruction is used. + // If not, delete it. + for (MachineBasicBlock::reverse_instr_iterator MI = BB.instr_rbegin(), + ME = BB.instr_rend(); + MI != ME;) { + // From DeadMachineInstructionElem. Don't delete inline assembly. + if (MI->isInlineAsm()) { + ++MI; + continue; + } + bool SawStore = false; + // Check if it's safe to remove the instruction due to side effects. + if (!MI->isSafeToMove(nullptr, SawStore)) { + ++MI; + continue; + } + unsigned Uses = 0; + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); + MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef()) + continue; + unsigned reg = MOI->getReg(); + // Assume physical registers are used. 
+ if (TargetRegisterInfo::isPhysicalRegister(reg)) { + Uses++; + continue; + } + if (MRI->use_begin(reg) != MRI->use_end()) + Uses++; + } + if (!Uses) { + MI++->eraseFromParent(); + continue; + } + ++MI; + } +} + +void HexagonMachineUnroller::optimize(MachineBasicBlock &BB) const { + replacePostIncWithBaseOffset(&BB); + foldAdds(BB); + generatePostInc(&BB); +} Index: lib/Target/Hexagon/HexagonTargetMachine.cpp =================================================================== --- lib/Target/Hexagon/HexagonTargetMachine.cpp +++ lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -15,6 +15,7 @@ #include "Hexagon.h" #include "HexagonISelLowering.h" #include "HexagonMachineScheduler.h" +#include "HexagonMachineUnroller.h" #include "HexagonTargetObjectFile.h" #include "HexagonTargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" @@ -109,6 +110,12 @@ extern "C" int HexagonTargetMachineModule; int HexagonTargetMachineModule = 0; +static MachineUnroller * +createHexagonMachineUnroller(MachineUnrollerContext *C) { + MachineUnroller *U = new HexagonMachineUnroller(C); + return U; +} + static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) { ScheduleDAGMILive *DAG = new VLIWMachineScheduler(C, make_unique()); @@ -291,6 +298,11 @@ return createVLIWMachineSched(C); } + MachineUnroller* + createMachineUnroller(MachineUnrollerContext *C) const override { + return createHexagonMachineUnroller(C); + } + void addIRPasses() override; bool addInstSelector() override; void addPreRegAlloc() override; Index: test/CodeGen/Hexagon/bit-gen-rseq.ll =================================================================== --- test/CodeGen/Hexagon/bit-gen-rseq.ll +++ test/CodeGen/Hexagon/bit-gen-rseq.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=hexagon -disable-hsdr -hexagon-subreg-liveness < %s | FileCheck %s +; RUN: llc -march=hexagon -disable-hsdr -hexagon-subreg-liveness \ +; RUN: -enable-pipeliner-unroll=false < %s | FileCheck %s ; Check that we don't generate any bitwise operations. 
; CHECK-NOT: = or( Index: test/CodeGen/Hexagon/hwloop4.ll =================================================================== --- test/CodeGen/Hexagon/hwloop4.ll +++ test/CodeGen/Hexagon/hwloop4.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner-unroll=false \ +; RUN: < %s | FileCheck %s ; ; Remove the unnecessary 'add' instruction used for the hardware loop setup. Index: test/CodeGen/Hexagon/late_instr.ll =================================================================== --- test/CodeGen/Hexagon/late_instr.ll +++ test/CodeGen/Hexagon/late_instr.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=hexagon -disable-hsdr < %s | FileCheck %s +; RUN: llc -march=hexagon -disable-hsdr -enable-pipeliner-unroll=false \ +; RUN: < %s | FileCheck %s ; Check if instruction vandqrt.acc and its predecessor are scheduled in consecutive packets. ; CHECK: or(q{{[0-3]+}},q{{[0-3]+}}) Index: test/CodeGen/Hexagon/miunroll-optimize-memrefs1.ll =================================================================== --- /dev/null +++ test/CodeGen/Hexagon/miunroll-optimize-memrefs1.ll @@ -0,0 +1,93 @@ +; RUN: llc -O3 -march=hexagon -enable-pipeliner-unroll=false \ +; RUN: < %s | FileCheck --check-prefix=CHECK-NO-UNROLL %s + +; RUN: llc -O3 -march=hexagon -enable-pipeliner-unroll=true \ +; RUN: < %s | FileCheck --check-prefix=CHECK-UNROLL %s + +; Without the machine unroller, make sure that the inner most loop has only one sfmpy instruction. + +; CHECK-NO-UNROLL: loop0(.LBB0_[[LOOP:.]] +; CHECK-NO-UNROLL: .LBB0_[[LOOP]]: +; CHECK-NO-UNROLL: { +; CHECK-NO-UNROLL-DAG: { +; CHECK-NO-UNROLL-DAG: sfmpy +; CHECK-NO-UNROLL-NOT: sfmpy +; CHECK-NO-UNROLL: endloop0 +; CHECK-NO-UNROLL-NOT: loop0 + +; When the machine unroller is enabled, the inner most loop in the test +; gets unrolled by 2. Make sure that there are only 3 packets and +; 2 sfmpy instructions (one for each loop iteration) in the unrolled loop. 
+ +; CHECK-UNROLL: loop0(.LBB0_[[LOOP:.]] +; CHECK-UNROLL: .LBB0_[[LOOP]]: +; CHECK-UNROLL: sfmpy +; CHECK-UNROLL: sfmpy +; CHECK-UNROLL-NOT: sfmpy +; CHECK-UNROLL: } :endloop0 + + +%struct.loops_params_s = type { i32, i32, i32, i32, i32, i32, i32, [32 x i32], [32 x i32], i32, i32, i32, i32, i32, i8*, i32, i32, float**, i32**, float*, float**, float**, float*, i32*, i8*, %struct.intparts_s*, float, float*, float*, i32 } +%struct.intparts_s = type { i8, i16, i32, i32 } + +; Function Attrs: nounwind +define float @inner_product(%struct.loops_params_s* %p) { +entry: + %v = getelementptr inbounds %struct.loops_params_s, %struct.loops_params_s* %p, i32 0, i32 17 + %0 = load float**, float*** %v, align 4 + %1 = load float*, float** %0, align 4 + %arrayidx2 = getelementptr inbounds float*, float** %0, i32 1 + %2 = load float*, float** %arrayidx2, align 4 + %N = getelementptr inbounds %struct.loops_params_s, %struct.loops_params_s* %p, i32 0, i32 5 + %3 = load i32, i32* %N, align 4 + %Loop = getelementptr inbounds %struct.loops_params_s, %struct.loops_params_s* %p, i32 0, i32 9 + %4 = load i32, i32* %Loop, align 4 + %vsize = getelementptr inbounds %struct.loops_params_s, %struct.loops_params_s* %p, i32 0, i32 1 + %5 = load i32, i32* %vsize, align 4 + %call = tail call i32 bitcast (i32 (...)* @reinit_vec to i32 (%struct.loops_params_s*, float*, i32)*)(%struct.loops_params_s* %p, float* %1, i32 %5) + %6 = load i32, i32* %vsize, align 4 + %call4 = tail call i32 bitcast (i32 (...)* @reinit_vec to i32 (%struct.loops_params_s*, float*, i32)*)(%struct.loops_params_s* %p, float* %2, i32 %6) + %cmp39 = icmp slt i32 %4, 1 + br i1 %cmp39, label %for.end13, label %for.body.lr.ph + +for.body.lr.ph: + %cmp636 = icmp sgt i32 %3, 0 + br label %for.body + +for.body: + %q.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %q.1.lcssa, %for.inc11 ] + %l.040 = phi i32 [ 1, %for.body.lr.ph ], [ %inc12, %for.inc11 ] + br i1 %cmp636, label %for.body7.lr.ph, label %for.inc11 + +for.body7.lr.ph: 
+ %arrayidx8.gep = getelementptr float, float* %2, i32 %l.040 + br label %for.body7 + +for.body7: + %q.138 = phi float [ %q.042, %for.body7.lr.ph ], [ %add10, %for.body7 ] + %arrayidx8.phi = phi float* [ %arrayidx8.gep, %for.body7.lr.ph ], [ %arrayidx8.inc, %for.body7 ] + %arrayidx9.phi = phi float* [ %1, %for.body7.lr.ph ], [ %arrayidx9.inc, %for.body7 ] + %k.037 = phi i32 [ 0, %for.body7.lr.ph ], [ %inc, %for.body7 ] + %7 = load float, float* %arrayidx8.phi, align 4 + %8 = load float, float* %arrayidx9.phi, align 4 + %mul = fmul float %7, %8 + %add10 = fadd float %q.138, %mul + %inc = add nuw nsw i32 %k.037, 1 + %exitcond = icmp eq i32 %inc, %3 + %arrayidx8.inc = getelementptr float, float* %arrayidx8.phi, i32 32 + %arrayidx9.inc = getelementptr float, float* %arrayidx9.phi, i32 32 + br i1 %exitcond, label %for.inc11, label %for.body7 + +for.inc11: + %q.1.lcssa = phi float [ %q.042, %for.body ], [ %add10, %for.body7 ] + %inc12 = add nuw nsw i32 %l.040, 1 + %exitcond44 = icmp eq i32 %l.040, %4 + br i1 %exitcond44, label %for.end13, label %for.body + +for.end13: + %q.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %q.1.lcssa, %for.inc11 ] + ret float %q.0.lcssa +} + +declare i32 @reinit_vec(...) local_unnamed_addr #0 + Index: test/CodeGen/Hexagon/miunroll-optimize-memrefs2.ll =================================================================== --- /dev/null +++ test/CodeGen/Hexagon/miunroll-optimize-memrefs2.ll @@ -0,0 +1,65 @@ +; RUN: llc -O3 -march=hexagon -enable-pipeliner-unroll=false \ +; RUN: < %s | FileCheck --check-prefix=CHECK-NO-UNROLL %s + +; RUN: llc -O3 -march=hexagon -enable-pipeliner-unroll=true \ +; RUN: < %s | FileCheck --check-prefix=CHECK-UNROLL %s + +; Without the machine unroller, check that the inner most loop has only one sfmpy instruction. 
+ +; CHECK-NO-UNROLL: loop0(.LBB0_[[LOOP:.]] +; CHECK-NO-UNROLL: .LBB0_[[LOOP]]: +; CHECK-NO-UNROLL: { +; CHECK-NO-UNROLL-DAG: { +; CHECK-NO-UNROLL-DAG: sfmpy +; CHECK-NO-UNROLL-NOT: sfmpy +; CHECK-NO-UNROLL: endloop0 +; CHECK-NO-UNROLL-NOT: loop0 + +; When the machine unroller is enabled, the inner most loop in the test +; gets unrolled by 4. Make sure that there are only 4 packets and +; 4 sfmpy instructions (one for each loop iteration) in the unrolled loop. + +; CHECK-UNROLL: loop0(.LBB0_[[LOOP:.]] +; CHECK-UNROLL: .LBB0_[[LOOP]]: +; CHECK-UNROLL: { +; CHECK-UNROLL-NOT: { +; CHECK-UNROLL: sfmpy +; CHECK-UNROLL: { +; CHECK-UNROLL-NOT: { +; CHECK-UNROLL: sfmpy +; CHECK-UNROLL: { +; CHECK-UNROLL-NOT: { +; CHECK-UNROLL: sfmpy +; CHECK-UNROLL: { +; CHECK-UNROLL-NOT: { +; CHECK-UNROLL: sfmpy +; CHECK-UNROLL-NOT: { +; CHECK-UNROLL: } :endloop0 +; CHECK-UNROLL: loop0(.LBB0_[[LOOP:.]] + +; Function Attrs: norecurse nounwind readonly +define float @PolyEval_horner(float %pt, i32 %degree, float* noalias nocapture readonly %coeff) local_unnamed_addr { +entry: + %arrayidx = getelementptr inbounds float, float* %coeff, i32 %degree + %0 = load float, float* %arrayidx, align 4 + %tobool8 = icmp eq i32 %degree, 0 + br i1 %tobool8, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %sum.010 = phi float [ %add, %while.body ], [ %0, %while.body.preheader ] + %i.09 = phi i32 [ %sub, %while.body ], [ %degree, %while.body.preheader ] + %mul = fmul contract float %sum.010, %pt + %sub = add i32 %i.09, -32 + %arrayidx1 = getelementptr inbounds float, float* %coeff, i32 %sub + %1 = load float, float* %arrayidx1, align 4 + %add = fadd contract float %mul, %1 + %tobool = icmp eq i32 %sub, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: + %sum.0.lcssa = phi float [ %0, %entry ], [ %add, %while.body ] + ret float %sum.0.lcssa +} Index: test/CodeGen/Hexagon/miunroll-update-memoperands.ll 
=================================================================== --- /dev/null +++ test/CodeGen/Hexagon/miunroll-update-memoperands.ll @@ -0,0 +1,64 @@ +; RUN: llc -march=hexagon -O3 -enable-pipeliner-unroll=true < %s +; REQUIRES: asserts + +; This test used to fail with an "UNREACHABLE" executed in Machine Unroller due to a bug +; in computeDelta function. + +%class.mrObjectRecord = type { i32, i32, %class.mrSurfaceList, i32, i32, i32, i32, i32, i32 } +%class.mrSurfaceList = type { %class.ggSolidTexture, %class.ggTrain } +%class.ggSolidTexture = type { i32 (...)** } +%class.ggTrain = type { %class.ggSolidTexture**, i32, i32 } + +declare i32 @__gxx_personality_v0(...) + +; Function Attrs: nobuiltin +declare void @_Znaj() local_unnamed_addr + +; Function Attrs: norecurse +declare dso_local fastcc %class.mrObjectRecord* @_ZN12ggDictionaryI14mrObjectRecordE6lookUpERK8ggString() unnamed_addr align 2 + +; Function Attrs: norecurse +define dso_local fastcc void @_ZN7mrScene9AddObjectEP9mrSurfaceRK8ggStringS4_i() unnamed_addr align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + br i1 undef, label %_ZN12ggDictionaryI10ggMaterialE6lookUpERK8ggString.exit, label %while.body.i.i.lr.ph + +while.body.i.i.lr.ph: + unreachable + +_ZN12ggDictionaryI10ggMaterialE6lookUpERK8ggString.exit: + %call5 = tail call fastcc %class.mrObjectRecord* @_ZN12ggDictionaryI14mrObjectRecordE6lookUpERK8ggString() + br i1 undef, label %if.then7, label %if.end11 + +if.then7: + invoke void @_Znaj() + to label %invoke.cont unwind label %lpad + +invoke.cont: + br label %if.end11 + +lpad: + %0 = landingpad { i8*, i32 } + cleanup + resume { i8*, i32 } %0 + +if.end11: + %recPtr.0 = phi %class.mrObjectRecord* [ %call5, %_ZN12ggDictionaryI10ggMaterialE6lookUpERK8ggString.exit ], [ undef, %invoke.cont ] + %surfaces.i.i7 = getelementptr inbounds %class.mrObjectRecord, %class.mrObjectRecord* %recPtr.0, i32 0, i32 2, i32 1 + %data.i.i.i11 = getelementptr inbounds %class.ggTrain, 
%class.ggTrain* %surfaces.i.i7, i32 0, i32 0 + br label %for.body.i.i.i + +for.cond.cleanup.i.i.i: + ret void + +for.body.i.i.i: + %i.0.i.i.i52 = phi i32 [ %inc.i.i.i, %for.body.i.i.i ], [ 0, %if.end11 ] + %1 = load i32, i32* undef, align 4 + %2 = load %class.ggSolidTexture**, %class.ggSolidTexture*** %data.i.i.i11, align 4 + %arrayidx9.i.i.i = getelementptr inbounds %class.ggSolidTexture*, %class.ggSolidTexture** %2, i32 %i.0.i.i.i52 + %3 = bitcast %class.ggSolidTexture** %arrayidx9.i.i.i to i32* + store i32 %1, i32* %3, align 4 + %inc.i.i.i = add nuw nsw i32 %i.0.i.i.i52, 1 + %cmp7.i.i.i = icmp slt i32 %inc.i.i.i, undef + br i1 %cmp7.i.i.i, label %for.body.i.i.i, label %for.cond.cleanup.i.i.i +} + Index: test/CodeGen/Hexagon/miunroll-update-offset.ll =================================================================== --- /dev/null +++ test/CodeGen/Hexagon/miunroll-update-offset.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=hexagon -O3 -enable-pipeliner-unroll=true < %s | FileCheck %s + +; After machine unrolling the loop, make sure that all base+offset loads +; use correct base and offset values. 
+ +; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: .LBB0_[[LOOP]]: +; CHECK: memh([[REG1:(r[0-9]+)]]+#{{[0-9]+}}) = r{{[0-9]+}} +; CHECK-DAG: memh([[REG1]]+#{{32|0}}) = r{{[0-9]+}} +; CHECK-DAG: memh([[REG1]]+#64) = r{{[0-9]+}} +; CHECK-DAG: memh([[REG1]]+#96) = r{{[0-9]+}} +; CHECK: endloop0 + +%struct.csGroup = type { i32, i32, i32, i16, i16, i16, i16, i16, i16, i16, i16, i16} + +@numRows = external local_unnamed_addr global i32, align 4 +@MPG = common local_unnamed_addr global i32 0, align 4 +@groupArray = common local_unnamed_addr global %struct.csGroup* null, align 4 +@numGroups = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: nounwind +define i32 @globe() local_unnamed_addr { +entry: + %0 = load i32, i32* @numRows, align 4 + %add = shl i32 %0, 1 + %add1 = add i32 %add, 6 + store i32 %add1, i32* @MPG, align 4 + %1 = mul i32 %0, 72 + %mul3 = add i32 %1, 252 + %call = tail call i32 bitcast (i32 (...)* @safe_malloc to i32 (i32)*)(i32 %mul3) #2 + %2 = inttoptr i32 %call to %struct.csGroup* + store %struct.csGroup* %2, %struct.csGroup** @groupArray, align 4 + %3 = load i32, i32* @numGroups, align 4 + %cmp10 = icmp slt i32 %3, 1 + br i1 %cmp10, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %group.011 = phi i32 [ %inc, %for.body ], [ 1, %for.body.preheader ] + %conv = trunc i32 %group.011 to i16 + %flag = getelementptr inbounds %struct.csGroup, %struct.csGroup* %2, i32 %group.011, i32 11 + store i16 %conv, i16* %flag, align 4 + %inc = add nuw nsw i32 %group.011, 1 + %cmp = icmp slt i32 %group.011, %3 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret i32 undef +} + +declare i32 @safe_malloc(...) 
local_unnamed_addr Index: test/CodeGen/Hexagon/miunroll.ll =================================================================== --- /dev/null +++ test/CodeGen/Hexagon/miunroll.ll @@ -0,0 +1,55 @@ +; RUN: llc -O3 -march=hexagon -enable-pipeliner-unroll=false \ +; RUN: < %s | FileCheck --check-prefix=CHECK-NO-UNROLL %s + +; RUN: llc -O3 -march=hexagon -enable-pipeliner-unroll=true \ +; RUN: < %s | FileCheck --check-prefix=CHECK-UNROLL %s + +; Make sure that there's only one hardware loop when the machine unroller is disabled. +; CHECK-NO-UNROLL: loop0(.LBB0_[[LOOP:.]] +; CHECK-NO-UNROLL: .LBB0_[[LOOP]]: +; CHECK-NO-UNROLL: sfmpy +; CHECK-NO-UNROLL-NOT: sfmpy +; CHECK-NO-UNROLL: endloop0 +; CHECK-NO-UNROLL-NOT: loop0 + +; Make sure that there are multiple hardware loops when the machine unroller is enabled, one for the unrolled loop and another for the remainder loop. +; CHECK-UNROLL: loop0(.LBB0_[[LOOP:.]] +; CHECK-UNROLL: .LBB0_[[LOOP]]: +; CHECK-UNROLL: sfmpy +; CHECK-UNROLL: sfmpy +; CHECK-UNROLL: endloop0 + +define float @test(i32 %n, float %da, float* noalias nocapture readonly %dx, i32 %incx, float* noalias nocapture %dy, i32 %incy) local_unnamed_addr { +entry: + %cmp = icmp slt i32 %n, 1 + %cmp1 = fcmp oeq float %da, 0.000000e+00 + %or.cond45 = or i1 %cmp, %cmp1 + br i1 %or.cond45, label %if.then6, label %if.end3 + +if.end3: + %cmp4 = icmp ne i32 %incx, 1 + %cmp5 = icmp ne i32 %incy, 1 + %or.cond = or i1 %cmp4, %cmp5 + br i1 %or.cond, label %if.then6, label %for.body.lr.ph + +if.then6: + ret float 0.000000e+00 + +for.body.lr.ph: + %0 = load float, float* %dy, align 4 + br label %for.body + +for.body: + %arrayidx18.phi = phi float* [ %dx, %for.body.lr.ph ], [ %arrayidx18.inc, %for.body ] + %arrayidx21.phi = phi float* [ %dy, %for.body.lr.ph ], [ %arrayidx21.inc, %for.body ] + %i.047 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %1 = load float, float* %arrayidx18.phi, align 4 + %mul19 = fmul float %1, %da + %add20 = fadd float %0, %mul19 + store float 
%add20, float* %arrayidx21.phi, align 4 + %inc = add nuw nsw i32 %i.047, 1 + %exitcond = icmp eq i32 %inc, %n + %arrayidx18.inc = getelementptr float, float* %arrayidx18.phi, i32 32 + %arrayidx21.inc = getelementptr float, float* %arrayidx21.phi, i32 32 + br i1 %exitcond, label %if.then6, label %for.body +} Index: test/CodeGen/Hexagon/no-packets.ll =================================================================== --- test/CodeGen/Hexagon/no-packets.ll +++ test/CodeGen/Hexagon/no-packets.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner-unroll=false < %s | FileCheck %s ; Check that there are no packets with two or more instructions, except ; for the endloop packet. Index: test/CodeGen/Hexagon/simplify64bitops_7223.ll =================================================================== --- test/CodeGen/Hexagon/simplify64bitops_7223.ll +++ test/CodeGen/Hexagon/simplify64bitops_7223.ll @@ -1,6 +1,9 @@ -; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner=false \ +; RUN: -enable-pipeliner-unroll=false < %s | FileCheck %s + ; RUN: llc -march=hexagon -enable-pipeliner < %s ; REQUIRES: asserts + ; CHECK-NOT: and( ; CHECK-NOT: or( ; CHECK-NOT: combine(0 Index: test/CodeGen/Hexagon/swp-carried-1.ll =================================================================== --- test/CodeGen/Hexagon/swp-carried-1.ll +++ test/CodeGen/Hexagon/swp-carried-1.ll @@ -1,4 +1,6 @@ -; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched \ +; RUN: -enable-pipeliner-unroll=false -hexagon-initial-cfg-cleanup=0 \ +; RUN: < %s | FileCheck %s ; Test that we generate the correct code when a loop carried value ; is scheduled one stage earlier than it's use. 
The code in Index: test/CodeGen/Hexagon/swp-change-deps.ll =================================================================== --- test/CodeGen/Hexagon/swp-change-deps.ll +++ test/CodeGen/Hexagon/swp-change-deps.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner-unroll=false \ +; RUN: -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s ; Test that we generate the correct offsets for loads in the prolog ; after removing dependences on a post-increment instructions of the Index: test/CodeGen/Hexagon/swp-epilog-numphis.ll =================================================================== --- test/CodeGen/Hexagon/swp-epilog-numphis.ll +++ test/CodeGen/Hexagon/swp-epilog-numphis.ll @@ -1,6 +1,6 @@ ; XFAIL: * ; Needs some fixed in the pipeliner. -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner-unroll=false < %s | FileCheck %s ; CHECK: endloop0 ; CHECK: vmem Index: test/CodeGen/Hexagon/swp-epilog-phi9.ll =================================================================== --- test/CodeGen/Hexagon/swp-epilog-phi9.ll +++ test/CodeGen/Hexagon/swp-epilog-phi9.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner-unroll=false \ +; RUN: -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s ; Test that we generate the correct Phi name in the last couple of epilog ; blocks, when there are 3 epilog blocks. 
The Phi was scheduled in stage Index: test/CodeGen/Hexagon/swp-max.ll =================================================================== --- test/CodeGen/Hexagon/swp-max.ll +++ test/CodeGen/Hexagon/swp-max.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner \ -; RUN: -pipeliner-max-stages=2 < %s | FileCheck %s +; RUN: -pipeliner-max-stages=2 -enable-pipeliner-unroll=false \ +; RUN: < %s | FileCheck %s @A = global [8 x i32] [i32 4, i32 -3, i32 5, i32 -2, i32 -1, i32 2, i32 6, i32 -2], align 8 Index: test/CodeGen/Hexagon/swp-multi-loops.ll =================================================================== --- test/CodeGen/Hexagon/swp-multi-loops.ll +++ test/CodeGen/Hexagon/swp-multi-loops.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner \ +; RUN: -enable-pipeliner-unroll=false < %s | FileCheck %s ; Make sure we attempt to pipeline all inner most loops. Index: test/CodeGen/Hexagon/swp-vsum.ll =================================================================== --- test/CodeGen/Hexagon/swp-vsum.ll +++ test/CodeGen/Hexagon/swp-vsum.ll @@ -1,5 +1,11 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s -; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s | FileCheck %s --check-prefix=CHECKV60 +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner \ +; RUN: -enable-pipeliner-unroll=false < %s | FileCheck %s + +; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 -enable-pipeliner-unroll=false \ +; RUN: < %s | FileCheck %s + +; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner \ +; RUN: -enable-pipeliner-unroll=false < %s | FileCheck %s --check-prefix=CHECKV60 ; Simple vector total. 
; CHECK: loop0(.LBB0_[[LOOP:.]], Index: test/CodeGen/Hexagon/swp-xxh2.ll =================================================================== --- test/CodeGen/Hexagon/swp-xxh2.ll +++ test/CodeGen/Hexagon/swp-xxh2.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -enable-pipeliner-unroll=false \ +; RUN: -debug-only=pipeliner < %s -o - 2>&1 > /dev/null | FileCheck %s ; REQUIRES: asserts ; Fix bug when pipelining xxh benchmark at O3, mv55, and with vectorization.