Index: llvm/include/llvm/Analysis/LoopInfo.h =================================================================== --- llvm/include/llvm/Analysis/LoopInfo.h +++ llvm/include/llvm/Analysis/LoopInfo.h @@ -441,7 +441,9 @@ /// header then the node is returned. If any latch instruction does not /// contain llvm.loop or or if multiple latches contain different nodes then /// 0 is returned. - MDNode *getLoopID() const; + std::pair getLoopIDWithInstr() const; + MDNode *getLoopID() const { return getLoopIDWithInstr().first; } + /// Set the llvm.loop loop id metadata for this loop. /// /// The LoopID metadata node will be added to each terminator instruction in Index: llvm/include/llvm/IR/Instruction.h =================================================================== --- llvm/include/llvm/IR/Instruction.h +++ llvm/include/llvm/IR/Instruction.h @@ -188,6 +188,10 @@ getAllMetadataOtherThanDebugLocImpl(MDs); } + /// Get all the metadata IDs attached to this Instruction except the ones + /// associated with debug location. The Vector \p MDIDs is not cleared. + void getAllNonDebugMetadataIDs(SmallVectorImpl &MDIDs) const; + /// Fills the AAMDNodes structure with AA metadata from this instruction. /// When Merge is true, the existing AA metadata is merged with that from this /// instruction providing the most-general result. Index: llvm/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/include/llvm/Transforms/Scalar.h +++ llvm/include/llvm/Transforms/Scalar.h @@ -183,7 +183,7 @@ // // LoopRotate - This pass is a simple loop rotating pass. // -Pass *createLoopRotatePass(int MaxHeaderSize = -1); +Pass *createLoopRotatePass(); //===----------------------------------------------------------------------===// // Index: llvm/lib/Analysis/LoopInfo.cpp =================================================================== --- llvm/lib/Analysis/LoopInfo.cpp +++ llvm/lib/Analysis/LoopInfo.cpp @@ -203,10 +203,12 @@ return true; } -MDNode *Loop::getLoopID() const { +std::pair Loop::getLoopIDWithInstr() const { MDNode *LoopID = nullptr; + Instruction *I = nullptr; if (isLoopSimplifyForm()) { - LoopID = getLoopLatch()->getTerminator()->getMetadata(LLVMContext::MD_loop); + I = getLoopLatch()->getTerminator(); + LoopID = I->getMetadata(LLVMContext::MD_loop); } else { // Go through each predecessor of the loop header and check the // terminator for the metadata. @@ -218,17 +220,19 @@ TerminatorInst *TI = BB->getTerminator(); if (MDNode *MD = TI->getMetadata(LLVMContext::MD_loop)) { - if (!LoopID) + if (!LoopID) { LoopID = MD; + I = TI; + } else if (MD != LoopID) // Multiple MD_loop found => corrupt metadata. - return nullptr; + return { nullptr, nullptr }; } } } if (!LoopID || LoopID->getNumOperands() == 0 || LoopID->getOperand(0) != LoopID) - return nullptr; - return LoopID; + return { nullptr, nullptr }; + return { LoopID, I }; } void Loop::setLoopID(MDNode *LoopID) const { Index: llvm/lib/IR/LLVMContextImpl.h =================================================================== --- llvm/lib/IR/LLVMContextImpl.h +++ llvm/lib/IR/LLVMContextImpl.h @@ -993,6 +993,7 @@ /// Copies all the current attachments into \c Result, sorting by attachment /// ID. This function does \em not clear \c Result. void getAll(SmallVectorImpl> &Result) const; + void getAllIDs(SmallVectorImpl &Result) const; /// \brief Erase matching attachments. /// Index: llvm/lib/IR/Metadata.cpp =================================================================== --- llvm/lib/IR/Metadata.cpp +++ llvm/lib/IR/Metadata.cpp @@ -1106,6 +1106,11 @@ array_pod_sort(Result.begin(), Result.end()); } +void MDAttachmentMap::getAllIDs(SmallVectorImpl &Result) const { + for (auto &A : Attachments) + Result.push_back(A.first); +} + void MDGlobalAttachmentMap::insert(unsigned ID, MDNode &MD) { Attachments.push_back({ID, TrackingMDNodeRef(&MD)}); } @@ -1154,14 +1159,13 @@ } void Instruction::dropUnknownNonDebugMetadata(ArrayRef KnownIDs) { - SmallSet KnownSet; - KnownSet.insert(KnownIDs.begin(), KnownIDs.end()); - if (!hasMetadataHashEntry()) return; // Nothing to remove! auto &InstructionMetadata = getContext().pImpl->InstructionMetadata; + SmallSet KnownSet; + KnownSet.insert(KnownIDs.begin(), KnownIDs.end()); if (KnownSet.empty()) { // Just drop our entry at the store. InstructionMetadata.erase(this); @@ -1270,6 +1274,15 @@ Info.getAll(Result); } +void +Instruction::getAllNonDebugMetadataIDs(SmallVectorImpl &MDIDs) const { + if (!hasMetadataOtherThanDebugLoc()) + return; + const auto &Info = getContext().pImpl->InstructionMetadata.find(this)->second; + assert(!Info.empty() && "Shouldn't have called this"); + Info.getAllIDs(MDIDs); +} + bool Instruction::extractProfMetadata(uint64_t &TrueVal, uint64_t &FalseVal) const { assert( Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -303,8 +303,10 @@ MPM.add(createTailCallEliminationPass()); // Eliminate tail calls MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions - // Rotate Loop - disable header duplication at -Oz - MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + + // Rotate Loop - disable loop rotation at -Oz + if (SizeLevel != 2) + MPM.add(createLoopRotatePass()); MPM.add(createLICMPass()); // Hoist loop invariants MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); MPM.add(createCFGSimplificationPass()); @@ -545,7 +547,8 @@ // Re-rotate loops in all our loop nests. These may have fallout out of // rotated form due to GVN or other transformations, and the vectorizer relies // on the rotated form. Disable header duplication at -Oz. - MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + if (SizeLevel != 2) + MPM.add(createLoopRotatePass()); // Distribute loops to allow partial vectorization. I.e. isolate dependences // into separate loop that would otherwise inhibit vectorization. This is Index: llvm/lib/Transforms/Scalar/LoopRotation.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -7,18 +7,23 @@ // //===----------------------------------------------------------------------===// // -// This file implements Loop Rotation Pass. +// This file implements Loop Rotation Pass: +// 1. Canonicalize loop latch to have only one successor. +// 2. Clone all the BBs which are exiting the loop. +// 3. Adjust phi of cloned BBs +// 4. Add phi to the new loop header +// 5. Update DT // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopRotation.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -30,592 +35,620 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" + +#include + using namespace llvm; #define DEBUG_TYPE "loop-rotate" -static cl::opt DefaultRotationThreshold( - "rotation-max-header-size", cl::init(16), cl::Hidden, - cl::desc("The default maximum header size for automatic loop rotation")); +static cl::opt RotationMaxSize( + "rotation-max-size", cl::init(100), cl::Hidden, + cl::desc("The default maximum loop size for automatic loop rotation")); + +static cl::opt MaxExits("lr-max-exits", cl::init(10), cl::Hidden, + cl::desc("The maximum exits to be cloned")); + +static cl::opt RotationsMaxAllowed( + "lr-max-allowed", cl::init(1), cl::Hidden, + cl::desc("The maximum rotations of a loop")); STATISTIC(NumRotated, "Number of loops rotated"); -namespace { +// Insert \p NewBB in between \p PredBefore and \p Succ, and redirect edges +// accordingly. PredBefore -> NewBB -> Succ. Also move NewBB before Succ. +static void insertBetween(BasicBlock *NewBB, BasicBlock *PredBefore, + BasicBlock *Succ) { + BranchInst *NewBI = BranchInst::Create(Succ, NewBB); + NewBI->setDebugLoc(PredBefore->getTerminator()->getDebugLoc()); + + BranchInst *BLI = dyn_cast(PredBefore->getTerminator()); + for (unsigned I = 0, E = BLI->getNumSuccessors(); I < E; ++I) + if (BLI->getSuccessor(I) == Succ) { + BLI->setSuccessor(I, NewBB); + break; + } + // Move NewBB physically from the end of the block list. + Function *F = Succ->getParent(); + F->getBasicBlockList().splice(Succ->getIterator(), F->getBasicBlockList(), + NewBB); +} + +// Remove the arguments of all phi nodes in PhiBB coming from block From. +static void discardIncomingValues(BasicBlock *PhiBB, BasicBlock *From) { + for (Instruction &I : *PhiBB) { + PHINode *PN = dyn_cast(&I); + if (!PN) + break; + PN->removeIncomingValue(PN->getBasicBlockIndex(From)); + } +} + +/// Returns true of the region formed by [Entry, Exit] is a +/// single-entry-single-exit (SESE) region. All the traces from \p Entry to +/// \p Exit should be dominated by \p Entry and post-dominated by \p Exit. +static bool isSingleEntrySingleExit(const BasicBlock *Entry, + const BasicBlock *Exit, + DominatorTree *DT, DominatorTree *PDT) { + if (!DT->dominates(Entry, Exit)) + return false; + + if (!PDT->dominates(Exit, Entry)) + return false; + + for (auto I = df_begin(Entry), E = df_end(Entry); I != E;) { + if (*I == Exit) { + I.skipChildren(); + continue; + } + if (!DT->dominates(Entry, *I)) + return false; + ++I; + } + return true; +} + +/// Returns true of the region formed by [Entry, Exit] is a +/// single-entry-multiple-exit (SEME) region. All the traces from \p Entry +/// which leads to the \p Exit are analyzed. +static bool isSingleEntryMultipleExit(const BasicBlock *Entry, + const BasicBlock *Exit, + DominatorTree *DT) { + if (!DT->dominates(Entry, Exit)) + return false; + + for (auto I = idf_begin(Exit), E = idf_end(Exit); I != E;) { + if (*I == Entry) { + I.skipChildren(); + continue; + } + + if (!DT->dominates(Entry, *I)) + return false; + + ++I; + } + return true; +} + /// A simple loop rotation transformation. class LoopRotate { - const unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; AssumptionCache *AC; DominatorTree *DT; ScalarEvolution *SE; + Loop *L; + const char *LoopRotatedTimes = "llvm.loop.rotated.times"; + typedef SmallVectorImpl SmallVecBB; + typedef SmallPtrSetImpl SmallPtrSetBB; public: - LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - DominatorTree *DT, ScalarEvolution *SE) - : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) { - } - bool processLoop(Loop *L); + LoopRotate(LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE, Loop *L) + : LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), L(L) {} + bool processLoop(); private: - bool rotateLoop(Loop *L, bool SimplifiedLatch); - bool simplifyLoopLatch(Loop *L); -}; -} // end anonymous namespace - -/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the -/// old header into the preheader. If there were uses of the values produced by -/// these instruction that were outside of the loop, we have to insert PHI nodes -/// to merge the two values. Do this now. -static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, - BasicBlock *OrigPreheader, - ValueToValueMapTy &ValueMap) { - // Remove PHI node entries that are no longer live. - BasicBlock::iterator I, E = OrigHeader->end(); - for (I = OrigHeader->begin(); PHINode *PN = dyn_cast(I); ++I) - PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); - - // Now fix up users of the instructions in OrigHeader, inserting PHI nodes - // as necessary. - SSAUpdater SSA; - for (I = OrigHeader->begin(); I != E; ++I) { - Value *OrigHeaderVal = &*I; - - // If there are no uses of the value (e.g. because it returns void), there - // is nothing to rewrite. - if (OrigHeaderVal->use_empty()) - continue; + void rotateLoop(BasicBlock *NewH, const SmallVecBB &Blocks, + const SmallPtrSetBB &Exits) const; - Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal); - - // The value now exits in two versions: the initial value in the preheader - // and the loop "next" value in the original header. - SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); - SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); - SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); - - // Visit each use of the OrigHeader instruction. - for (Value::use_iterator UI = OrigHeaderVal->use_begin(), - UE = OrigHeaderVal->use_end(); - UI != UE;) { - // Grab the use before incrementing the iterator. - Use &U = *UI; - - // Increment the iterator before removing the use from the list. - ++UI; - - // SSAUpdater can't handle a non-PHI use in the same block as an - // earlier def. We can easily handle those cases manually. - Instruction *UserInst = cast(U.getUser()); - if (!isa(UserInst)) { - BasicBlock *UserBB = UserInst->getParent(); - - // The original users in the OrigHeader are already using the - // original definitions. - if (UserBB == OrigHeader) - continue; - - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped. - if (UserBB == OrigPreheader) { - U = OrigPreHeaderVal; - continue; - } - } + bool isProfitableToRotate(const SmallVecBB &Blocks, + const SmallPtrSetBB &Exits) const; - // Anything else can be handled by SSAUpdater. - SSA.RewriteUse(U); - } + bool isLegalToRotate() const; - // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug - // intrinsics. - LLVMContext &C = OrigHeader->getContext(); - if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) { - if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) { - for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) { - // Grab the use before incrementing the iterator. Otherwise, altering - // the Use will invalidate the iterator. - Use &U = *UI++; - DbgInfoIntrinsic *UserInst = dyn_cast(U.getUser()); - if (!UserInst) - continue; + void adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH, + BasicBlock *NewPH) const; - // The original users in the OrigHeader are already using the original - // definitions. - BasicBlock *UserBB = UserInst->getParent(); - if (UserBB == OrigHeader) - continue; + BasicBlock *collectSEMEBlocks(BasicBlock *OrigH, BasicBlock *OrigLatch, + SmallVecBB &Blocks, SmallPtrSetBB &Exits) const; - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped and anything else can be handled by - // the SSAUpdater. To avoid adding PHINodes, check if the value is - // available in UserBB, if not substitute undef. - Value *NewVal; - if (UserBB == OrigPreheader) - NewVal = OrigPreHeaderVal; - else if (SSA.HasValueForBlock(UserBB)) - NewVal = SSA.GetValueInMiddleOfBlock(UserBB); - else - NewVal = UndefValue::get(OrigHeaderVal->getType()); - U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal)); - } - } - } - } -} + void addNewPhisToNewHeader(const SmallVecBB &Blocks, BasicBlock *NewHeader, + BasicBlock *NewPreheader, BasicBlock *NewLatch, + ValueToValueMapTy &VMap) const; +}; -/// Rotate loop LP. Return true if the loop is rotated. -/// -/// \param SimplifiedLatch is true if the latch was just folded into the final -/// loop exit. In this case we may want to rotate even though the new latch is -/// now an exiting branch. This rotation would have happened had the latch not -/// been simplified. However, if SimplifiedLatch is false, then we avoid -/// rotating loops in which the latch exits to avoid excessive or endless -/// rotation. LoopRotate should be repeatable and converge to a canonical -/// form. This property is satisfied because simplifying the loop latch can only -/// happen once across multiple invocations of the LoopRotate pass. -bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { +// Check if there is something preventing loop from being rotated. +bool LoopRotate::isLegalToRotate() const { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; - BasicBlock *OrigHeader = L->getHeader(); - BasicBlock *OrigLatch = L->getLoopLatch(); + const BasicBlock *LoopLatch = L->getLoopLatch(); + if (!LoopLatch || isa(LoopLatch->getTerminator())) + return false; - BranchInst *BI = dyn_cast(OrigHeader->getTerminator()); - if (!BI || BI->isUnconditional()) + const BasicBlock *OrigH = L->getHeader(); + // The header may have invoke instruction to BBs within the loop. + const BranchInst *BI = dyn_cast(OrigH->getTerminator()); + if (!BI || !BI->isConditional()) return false; // If the loop header is not one of the loop exiting blocks then // either this loop is already rotated or it is not // suitable for loop rotation transformations. - if (!L->isLoopExiting(OrigHeader)) + if (!L->isLoopExiting(OrigH)) return false; - // If the loop latch already contains a branch that leaves the loop then the - // loop is already rotated. - if (!OrigLatch) + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) return false; - // Rotate if either the loop latch does *not* exit the loop, or if the loop - // latch was just simplified. - if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) + if (!isSingleEntryMultipleExit(OrigH, LoopLatch, DT)) return false; + return true; +} + +// Check if the size of rotated loop will be within limits. +bool LoopRotate::isProfitableToRotate(const SmallVecBB &Blocks, + const SmallPtrSetBB &Exits) const { // Check size of original header and reject loop if it is very big or we can't // duplicate blocks inside it. - { - SmallPtrSet EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - + if (Exits.size() > MaxExits) + return false; + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + unsigned LoopSize = 0; + for (const BasicBlock *BB : Blocks) { CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); + Metrics.analyzeBasicBlock(BB, *TTI, EphValues); + // TODO: Modify this because there might be blocks with indirectbr, invoke + // in the loop but we can cut the cloning part at that point and that will + // be the last exit BB. if (Metrics.notDuplicatable) { DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" << " instructions: "; L->dump()); return false; } + if (Metrics.convergent) { DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " "instructions: "; L->dump()); return false; } - if (Metrics.NumInsts > MaxHeaderSize) + + // TODO: Once this is crossed we can copy until the previous BB. + if (LoopSize + Metrics.NumInsts >= RotationMaxSize) return false; + LoopSize += Metrics.NumInsts; } + return true; +} - // Now, this loop is suitable for rotation. - BasicBlock *OrigPreheader = L->getLoopPreheader(); +// Add phis to the new header and adjust the phi nodes from the OrigHeader. +void LoopRotate::addNewPhisToNewHeader(const SmallVecBB &Blocks, + BasicBlock *NewHeader, + BasicBlock *NewPreheader, + BasicBlock *NewLatch, + ValueToValueMapTy &VMap) const { + // Add to NewHeader phi nodes for all copied variables which are used. + for (BasicBlock *BB : Blocks) { + for (Instruction &Inst : *BB) { + // Skip Ins with no use e.g., branches. + if (Inst.use_begin() == Inst.use_end()) + continue; - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!OrigPreheader) - return false; + for (auto UI = Inst.use_begin(), E = Inst.use_end(); UI != E;) { + Use &U = *UI++; + Instruction *UserInst = cast(U.getUser()); - // Anything ScalarEvolution may know about this loop or the PHI nodes - // in its header will soon be invalidated. - if (SE) - SE->forgetLoop(L); + // Nothing to rename when the use is dominated by the definition. + if (DT->dominates(&Inst, UserInst)) + continue; - DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + if (!L->contains(UserInst->getParent())) { + // Handle uses in the loop-closed-phi. + PHINode *ClosePhi = cast(UserInst); + BasicBlock *Pred = ClosePhi->getIncomingBlock(U.getOperandNo()); - // Find new Loop header. NewHeader is a Header's one and only successor - // that is inside loop. Header's other successor is outside the - // loop. Otherwise loop is not suitable for rotation. - BasicBlock *Exit = BI->getSuccessor(0); - BasicBlock *NewHeader = BI->getSuccessor(1); - if (L->contains(Exit)) - std::swap(Exit, NewHeader); - assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && - "Unable to determine loop header and exit blocks"); - - // This code assumes that the new header has exactly one predecessor. - // Remove any single-entry PHI nodes in it. - assert(NewHeader->getSinglePredecessor() && - "New header doesn't have one pred!"); - FoldSingleEntryPHINodes(NewHeader); - - // Begin by walking OrigHeader and populating ValueMap with an entry for - // each Instruction. - BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - ValueToValueMapTy ValueMap; - - // For PHI nodes, the value available in OldPreHeader is just the - // incoming value from OldPreHeader. - for (; PHINode *PN = dyn_cast(I); ++I) - ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); - - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - - // For the rest of the instructions, either hoist to the OrigPreheader if - // possible or create a clone in the OldPreHeader if not. - TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); - while (I != E) { - Instruction *Inst = &*I++; - - // If the instruction's operands are invariant and it doesn't read or write - // memory, then it is safe to hoist. Doing this doesn't change the order of - // execution in the preheader, but does prevent the instruction from - // executing in each iteration of the loop. This means it is safe to hoist - // something that might trap, but isn't safe to hoist something that reads - // memory (without proving that the loop doesn't write). - if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && - !Inst->mayWriteToMemory() && !isa(Inst) && - !isa(Inst) && !isa(Inst)) { - Inst->moveBefore(LoopEntryBranch); - continue; - } + // Do not rename a loop close phi node if its predecessor in the loop + // is dominated by Inst. + if (L->contains(Pred) && DT->dominates(BB, Pred)) + continue; + } + + PHINode *PN = + PHINode::Create(Inst.getType(), 2, "phi.nh", &*NewHeader->begin()); + PN->addIncoming(&Inst, NewLatch); + PN->addIncoming(cast(VMap[&Inst]), NewPreheader); - // Otherwise, create a duplicate of the instruction. - Instruction *C = Inst->clone(); - - // Eagerly remap the operands of the instruction. - RemapInstruction(C, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - - // With the operands remapped, see if the instruction constant folds or is - // otherwise simplifyable. This commonly occurs because the entry from PHI - // nodes allows icmps and other instructions to fold. - // FIXME: Provide TLI, DT, AC to SimplifyInstruction. - Value *V = SimplifyInstruction(C, DL); - if (V && LI->replacementPreservesLCSSAForm(C, V)) { - // If so, then delete the temporary instruction and stick the folded value - // in the map. - ValueMap[Inst] = V; - if (!C->mayHaveSideEffects()) { - delete C; - C = nullptr; + // When Inst does not dominate U, it is going to use the updated + // definition coming from PN. + U.set(PN); } - } else { - ValueMap[Inst] = C; - } - if (C) { - // Otherwise, stick the new instruction into the new block! - C->setName(Inst->getName()); - C->insertBefore(LoopEntryBranch); } } +} + +// Add incoming values to the (already present) PHIs of NewH. +void LoopRotate::adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH, + BasicBlock *NewPH) const { + for (Instruction &Inst : *NewH) { + PHINode *PN = dyn_cast(&Inst); + if (!PN) + break; + assert((PN->getNumOperands() == 1) && "NewH had multiple predecessors."); + Value *Op = PN->getIncomingValue(0); + if (Value *RenamedVal = VMap[Op]) + PN->addIncoming(RenamedVal, NewPH); + else // When no mapping is available (e.g., in case of a constant). + PN->addIncoming(Op, NewPH); + } +} - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - TerminatorInst *TI = OrigHeader->getTerminator(); - for (BasicBlock *SuccBB : TI->successors()) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - - // If there were any uses of instructions in the duplicated block outside the - // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap); - - // NewHeader is now the header of the loop. - L->moveToHeader(NewHeader); - assert(L->getHeader() == NewHeader && "Latch block is our new header"); - - // At this point, we've finished our major CFG changes. As part of cloning - // the loop into the preheader we've simplified instructions and the - // duplicated conditional branch may now be branching on a constant. If it is - // branching on a constant and if that constant means that we enter the loop, - // then we fold away the cond branch to an uncond branch. This simplifies the - // loop in cases important for nested loops, and it also means we don't have - // to split as many edges. - BranchInst *PHBI = cast(OrigPreheader->getTerminator()); - assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - if (!isa(PHBI->getCondition()) || - PHBI->getSuccessor(cast(PHBI->getCondition())->isZero()) != - NewHeader) { - // The conditional branch can't be folded, handle the general case. - // Update DominatorTree to reflect the CFG change we just made. Then split - // edges as necessary to preserve LoopSimplify form. - if (DT) { - // Everything that was dominated by the old loop header is now dominated - // by the original loop preheader. Conceptually the header was merged - // into the preheader, even though we reuse the actual block as a new - // loop latch. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); - SmallVector HeaderChildren(OrigHeaderNode->begin(), - OrigHeaderNode->end()); - DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); - for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) - DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); - - assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); - assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); - - // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(OrigHeader, OrigLatch); +// OrigH and OrigLatch represent an SEME region. Collect all BasicBlocks +// (bounded by \p OrigH and \p OrigLatch), which are exiting the loop in +// \p Blocks and collect all the exits from the region in \p Exits Returns the +// first basic block which was not copied. +BasicBlock *LoopRotate::collectSEMEBlocks(BasicBlock *OrigH, + BasicBlock *OrigLatch, + SmallVecBB &Blocks, + SmallPtrSetBB &Exits) const { + BasicBlock *NewH = nullptr; + for (auto BB = df_begin(OrigH), E = df_end(OrigH); BB != E;) { + if (!L->contains(*BB)) { + BB.skipChildren(); + continue; } - // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. - // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge( - OrigPreheader, NewHeader, - CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); - NewPH->setName(NewHeader->getName() + ".lr.ph"); - - // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. Note that Exit could be an exit block for multiple - // nested loops, causing both of the edges to now be critical and need to - // be split. - SmallVector ExitPreds(pred_begin(Exit), pred_end(Exit)); - bool SplitLatchEdge = false; - for (BasicBlock *ExitPred : ExitPreds) { - // We only need to split loop exit edges. - Loop *PredLoop = LI->getLoopFor(ExitPred); - if (!PredLoop || PredLoop->contains(Exit)) - continue; - if (isa(ExitPred->getTerminator())) - continue; - SplitLatchEdge |= L->getLoopLatch() == ExitPred; - BasicBlock *ExitSplit = SplitCriticalEdge( - ExitPred, Exit, - CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); - ExitSplit->moveBefore(Exit); + // Copy until any BB where the branch does not exit loop, or the loop-latch. + if (OrigLatch == *BB || !L->isLoopExiting(*BB) || + !isa((*BB)->getTerminator())) { + // This will become the new header. + NewH = *BB; + BB.skipChildren(); + } else { + Blocks.push_back(*BB); + + BranchInst *BI = cast((*BB)->getTerminator()); + for (unsigned B = 0, E = BI->getNumSuccessors(); B < E; ++B) { + BasicBlock *Succ = BI->getSuccessor(B); + if (!L->contains(Succ)) + Exits.insert(Succ); + } + ++BB; } - assert(SplitLatchEdge && - "Despite splitting all preds, failed to split latch exit?"); - } else { - // We can fold the conditional branch in the preheader, this makes things - // simpler. The first step is to remove the extra edge to the Exit block. - Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); - BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); - NewBI->setDebugLoc(PHBI->getDebugLoc()); - PHBI->eraseFromParent(); - - // With our CFG finalized, update DomTree if it is available. - if (DT) { - // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(NewHeader, OrigPreheader); - DT->changeImmediateDominator(OrigHeader, OrigLatch); - - // Brute force incremental dominator tree update. Call - // findNearestCommonDominator on all CFG predecessors of each child of the - // original header. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); - SmallVector HeaderChildren(OrigHeaderNode->begin(), - OrigHeaderNode->end()); - bool Changed; - do { - Changed = false; - for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) { - DomTreeNode *Node = HeaderChildren[I]; - BasicBlock *BB = Node->getBlock(); - - pred_iterator PI = pred_begin(BB); - BasicBlock *NearestDom = *PI; - for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) - NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); - - // Remember if this changes the DomTree. - if (Node->getIDom()->getBlock() != NearestDom) { - DT->changeImmediateDominator(BB, NearestDom); - Changed = true; - } - } + } + return NewH; +} - // If the dominator changed, this may have an effect on other - // predecessors, continue until we reach a fixpoint. - } while (Changed); +// Helper function for copySEME. Adjusts the PHIs of all the \p Exits bounding +// an SEME. \p VMap contains mapping of original BB vs copied BB. +static void adjustExitingPhis(ValueToValueMapTy &VMap, + const SmallPtrSetImpl &Exits) { + for (BasicBlock *BB : Exits) { + for (Instruction &Inst : *BB) { + PHINode *PN = dyn_cast(&Inst); + if (!PN) + break; + bool EdgeFromOrigBB = false; + for (unsigned i = 0, e = PN->getNumOperands(); i != e; ++i) { + Value *CopyB = VMap[PN->getIncomingBlock(i)]; + if (!CopyB) // Skip args coming from outside the SEME. + continue; + BasicBlock *CopyBB = cast(CopyB); + EdgeFromOrigBB = true; + Value *Op = PN->getIncomingValue(i); + if (Value *RenamedVal = VMap[Op]) + PN->addIncoming(RenamedVal, CopyBB); + else + // When no mapping is available it must be a constant. + PN->addIncoming(Op, CopyBB); + } + assert(EdgeFromOrigBB && "Illegal exit from SEME."); } } +} - assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); - assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); +/// Clones the basic blocks (\p Blocks) of an SEME bounded by \p Exits. +/// Blocks[0] is the entry basic block of the SEME. +/// The mapping between original BBs and correponding copied BBs are +/// populated in \p VMap. During copy the DOM and LI of CFG are updated. +/// \returns The entry point of the copied SEME. \p NameSuffix is used to suffix +/// name of the copied BBs. The copied SEME is also an SEME. +static BasicBlock* copyBlocks(const SmallVectorImpl &Blocks, + const SmallPtrSetImpl &Exits, + ValueToValueMapTy &VMap, + const Twine &NameSuffix, + DominatorTree *DT, LoopInfo *LI) { + // Step1: Clone the basic blocks and populate VMap. + BasicBlock *OrigH = Blocks[0]; + + Function *F = OrigH->getParent(); + SmallVector NewBlocks; + for (BasicBlock *BB : Blocks) { + assert(!isa(BB->getTerminator()) && + "Cannot clone SEME with indirect branches."); + + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); + // Move them physically from the end of the block list. + F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(), + NewBB); + Loop *BBLoop = LI->getLoopFor(BB); + Loop *BBParentLoop = BBLoop->getParentLoop(); + if (BBParentLoop) + BBParentLoop->addBasicBlockToLoop(NewBB, *LI); + VMap[BB] = NewBB; + NewBlocks.push_back(NewBB); + } - // Now that the CFG and DomTree are in a consistent state again, try to merge - // the OrigHeader block into OrigLatch. This will succeed if they are - // connected by an unconditional branch. This is just a cleanup so the - // emitted code isn't too gross in this common case. - MergeBlockIntoPredecessor(OrigHeader, DT, LI); + // Step2: Remap the names in copied BBs. + remapInstructionsInBlocks(NewBlocks, VMap); - DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + // Step3: Redirect the edges. + for (BasicBlock *BB : Blocks) { + BasicBlock *NewBB = cast(VMap[BB]); + BranchInst *BI = dyn_cast(NewBB->getTerminator()); + if (!BI) + continue; - ++NumRotated; - return true; -} + for (unsigned I = 0, E = BI->getNumSuccessors(); I < E; ++I) + if (auto *NewSucc = cast_or_null(VMap[BI->getSuccessor(I)])) + BI->setSuccessor(I, NewSucc); + } -/// Determine whether the instructions in this range may be safely and cheaply -/// speculated. This is not an important enough situation to develop complex -/// heuristics. We handle a single arithmetic instruction along with any type -/// conversions. -static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, - BasicBlock::iterator End, Loop *L) { - bool seenIncrement = false; - bool MultiExitLoop = false; + // Step4: Update the DOM of copied SEME. Except for the entry block its tree + // structure is the same as of original SEME so the dominators also follow the + // same structural property. If the IDom of original BB is not in SEME that + // means it is the entry block, in that case the new IDom of the new BB must + // be its single predecessor because we are dealing with an SEME region. + BasicBlock *EntryNewSEME = nullptr; + if (auto *DomT = DT->getNode(Blocks[0])->getIDom()) { + // Entry to SEME has a dominator, update the copied entry. + BasicBlock *Dom = DomT->getBlock(); + assert(!VMap[Dom]); // Dom does not belong to SEME => entry block. + BasicBlock *NewBB = cast(VMap[Blocks[0]]); + EntryNewSEME = NewBB; + DT->addNewBlock(NewBB, Dom); + DT->changeImmediateDominator(NewBB, Dom); + } - if (!L->getExitingBlock()) - MultiExitLoop = true; + for (auto BBI = std::next(Blocks.begin()); BBI != Blocks.end(); ++BBI) { + BasicBlock *NewBB = cast(VMap[*BBI]); + BasicBlock *Dom = DT->getNode(*BBI)->getIDom()->getBlock(); + BasicBlock *NewDom = cast(VMap[Dom]); + DT->addNewBlock(NewBB, NewDom); + DT->changeImmediateDominator(NewBB, NewDom); + } - for (BasicBlock::iterator I = Begin; I != End; ++I) { + // Step5: Adjust PHI nodes. + adjustExitingPhis(VMap, Exits); + return EntryNewSEME; +} - if (!isSafeToSpeculativelyExecute(&*I)) - return false; +/// Rotate loop L. +/// TODO: arrange newly inserted bbs. +void LoopRotate::rotateLoop(BasicBlock *NewH, const SmallVecBB &Blocks, + const SmallPtrSetBB &Exits) const { + BasicBlock *OrigH = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + BasicBlock *OrigPH = L->getLoopPreheader(); - if (isa(I)) - continue; + DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); - switch (I->getOpcode()) { - default: - return false; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast(I)->hasAllConstantIndices()) - return false; - // fall-thru to increment case - LLVM_FALLTHROUGH; - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: { - Value *IVOpnd = - !isa(I->getOperand(0)) - ? I->getOperand(0) - : !isa(I->getOperand(1)) ? I->getOperand(1) : nullptr; - if (!IVOpnd) - return false; - - // If increment operand is used outside of the loop, this speculation - // could cause extra live range interference. - if (MultiExitLoop) { - for (User *UseI : IVOpnd->users()) { - auto *UserInst = cast(UseI); - if (!L->contains(UserInst)) - return false; - } - } + ValueToValueMapTy VMap; + copyBlocks(Blocks, Exits, VMap, ".lr", DT, LI); + + // Redirect original preheader to the entry of SEME. + BranchInst *PBI = dyn_cast(OrigPH->getTerminator()); + assert(PBI && (1 == PBI->getNumSuccessors()) && + "Preheader does not have single successor."); - if (seenIncrement) - return false; - seenIncrement = true; + BasicBlock *CopyOrigH = cast(VMap[OrigH]); + PBI->setSuccessor(0, CopyOrigH); + DT->changeImmediateDominator(CopyOrigH, OrigPH); + L->moveToHeader(NewH); + + BasicBlock *BeforeLoop = nullptr; + for (BasicBlock *BB : predecessors(NewH)) + if (!L->contains(BB)) { + BeforeLoop = BB; break; } - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - // ignore type conversions - break; + assert(BeforeLoop && "No entry point to the loop from New Header."); + Function *F = BeforeLoop->getParent(); + // Move NewH physically to the beginning of the loop. + F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(), + NewH); + + // SplitEdge does not work properly with single-entry PHIs. + BasicBlock *NewPH = BasicBlock::Create( + NewH->getContext(), NewH->getName() + ".lr.ph", F, BeforeLoop); + + Loop *OuterLoop = LI->getLoopFor(OrigPH); + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(NewPH, *LI); + + // BeforeLoop --> NewPH --> NewH. + insertBetween(NewPH, BeforeLoop, NewH); + + DT->addNewBlock(NewPH, BeforeLoop); + DT->changeImmediateDominator(NewPH, BeforeLoop); + DT->changeImmediateDominator(NewH, NewPH); + + // Also, the original entry lost its immediate dominator so its dominator + // should be adjusted. We use SEME property => idom (OrigH) = its single pred. + DT->changeImmediateDominator(OrigH, OrigH->getSinglePredecessor()); + + for (BasicBlock *BB : Blocks) { + typedef DomTreeNodeBase DTNode; + DTNode *IDom = DT->getNode(BB); + std::vector::iterator I = IDom->begin(); + for (; I != IDom->end();) { + BasicBlock *ExitBB = (*I)->getBlock(); + if (L->contains(ExitBB)) { + ++I; + continue; + } + + BasicBlock *StaleIDom = DT->getNode(ExitBB)->getIDom()->getBlock(); + BasicBlock *NewBB = cast(VMap[BB]); + // NewIDom is correct because this part of CFG is up-to-date. + BasicBlock *NewIDom = DT->findNearestCommonDominator(StaleIDom, NewBB); + NewIDom = DT->findNearestCommonDominator(NewIDom, BB); + if (NewIDom != StaleIDom) { + DT->changeImmediateDominator(ExitBB, NewIDom); + DEBUG(dbgs() << "\nChanging IDom of " << *ExitBB << "to" << *NewIDom); + I = IDom->begin(); + } else + ++I; } } - return true; -} - -/// Fold the loop tail into the loop exit by speculating the loop tail -/// instructions. Typically, this is a single post-increment. In the case of a -/// simple 2-block loop, hoisting the increment can be much better than -/// duplicating the entire loop header. In the case of loops with early exits, -/// rotation will not work anyway, but simplifyLoopLatch will put the loop in -/// canonical form so downstream passes can handle it. -/// -/// I don't believe this invalidates SCEV. -bool LoopRotate::simplifyLoopLatch(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - if (!Latch || Latch->hasAddressTaken()) - return false; - - BranchInst *Jmp = dyn_cast(Latch->getTerminator()); - if (!Jmp || !Jmp->isUnconditional()) - return false; - BasicBlock *LastExit = Latch->getSinglePredecessor(); - if (!LastExit || !L->isLoopExiting(LastExit)) - return false; + adjustNewHeaderPhis(VMap, NewH, NewPH); - BranchInst *BI = dyn_cast(LastExit->getTerminator()); - if (!BI) - return false; + BasicBlock *NewLatch = L->getLoopLatch(); + assert(L->getLoopPreheader() && "Invalid loop preheader after rotation"); + assert(NewLatch && "Invalid loop latch after rotation"); - if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) - return false; + addNewPhisToNewHeader(Blocks, NewH, NewPH, NewLatch, VMap); - DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " - << LastExit->getName() << "\n"); + // Discard incoming values in the CopyOrigHeader, which are coming from + // OrigLatch since it has only one predecessor. + discardIncomingValues(CopyOrigH, OrigLatch); + discardIncomingValues(OrigH, OrigPH); - // Hoist the instructions from Latch into LastExit. - LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), - Latch->begin(), Jmp->getIterator()); + DEBUG(LI->verify(*DT)); + DEBUG(DT->verifyDomTree()); + assert(L->isRecursivelyLCSSAForm(*DT) && "Loop is not in LCSSA form."); - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); + DEBUG(dbgs() << "\nLoopRotation: rotated "; L->dumpVerbose()); - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); + assert(isSingleEntryMultipleExit(L->getHeader(), NewLatch, DT) && + "Rotated loop not an SEME"); + assert(isSingleEntryMultipleExit(CopyOrigH, NewPH, DT) && + "Copied SEME not an SEME"); - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); - return true; + ++NumRotated; } /// Rotate \c L, and return true if any modification was made. -bool LoopRotate::processLoop(Loop *L) { +bool LoopRotate::processLoop() { + int Times = 0; + if (auto Value = findStringMetadataForLoop(L, LoopRotatedTimes)) { + const MDOperand *Op = *Value; + assert(Op && mdconst::hasa(*Op) && "invalid metadata"); + Times = mdconst::extract(*Op)->getZExtValue(); + if (Times >= RotationsMaxAllowed) + return false; + } + + if (!isLegalToRotate()) + return false; + + BasicBlock *OrigH = L->getHeader(); + BasicBlock *LoopLatch = L->getLoopLatch(); + + // Basic blocks to be copied. + SmallVector Blocks; + SmallPtrSet Exits; + // Collect all nodes of the loop from header to latch. + BasicBlock *NewH = collectSEMEBlocks(OrigH, LoopLatch, Blocks, Exits); + assert(NewH && "Invalid SEME region."); + + if (!isProfitableToRotate(Blocks, Exits)) + return false; + // Save the loop metadata. - MDNode *LoopMD = L->getLoopID(); + MDNode *LoopMD = nullptr; + Instruction *TI = nullptr; + std::tie(LoopMD, TI) = L->getLoopIDWithInstr(); + + // Make sure the latch has only one successor. + if (!LoopLatch->getSingleSuccessor()) { + // Since LoopLatch was skipped while collecting SEME blocks, exits out of + // the loop-latch is collected here. + BranchInst *BLI = dyn_cast(LoopLatch->getTerminator()); + if (!BLI) + return false; + // If the NewH is the loop-latch => Exits does not include the exiting block + // from loop-latch see (collectSEMEBlocks). + if (NewH == LoopLatch) + for (unsigned I = 0, E = BLI->getNumSuccessors(); I < E; ++I) { + BasicBlock *Succ = BLI->getSuccessor(I); + if (Succ != OrigH && !L->contains(Succ)) + Exits.insert(Succ); + } + + // The old loop-latch will become the parent of new loop-latch. + DEBUG(dbgs() << "\nSplitting the edge of Loop:"; L->dumpVerbose();); + BasicBlock *NewLoopLatch = SplitEdge(LoopLatch, OrigH, DT, LI); + + // If the NewH is the loop-latch => NewH would change as the loop-latch has + // changed after splitting. Also, we need to add the (old) loop-latch to the + // blocks because it was skipped (all the blocks before NewH are copied to + // new SEME). + if (NewH == LoopLatch) { + NewH = NewLoopLatch; + Blocks.push_back(LoopLatch); + } + LoopLatch = NewLoopLatch; + } - // Simplify the loop latch before attempting to rotate the header - // upward. Rotation may not be needed if the loop tail can be folded into the - // loop exit. - bool SimplifiedLatch = simplifyLoopLatch(L); + assert(LoopLatch->getSingleSuccessor() && "Invalid SEME region."); - bool MadeChange = rotateLoop(L, SimplifiedLatch); - assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) && + // Anything ScalarEvolution may know about this loop or the PHI nodes in its + // header will soon be invalidated. + if (SE) + SE->forgetLoop(L); + + rotateLoop(NewH, Blocks, Exits); + assert(L->isLoopExiting(L->getLoopLatch()) && "Loop latch should be exiting after loop-rotate."); - // Restore the loop metadata. - // NB! We presume LoopRotation DOESN'T ADD its own metadata. - if ((MadeChange || SimplifiedLatch) && LoopMD) + if (LoopMD) { + // Drop the metadata from the old loop header. + SmallVector MDs; + TI->getAllNonDebugMetadataIDs(MDs); + MDs.erase(llvm::remove_if(MDs, [](unsigned ID) { + return ID == LLVMContext::MD_loop; }), MDs.end()); + + TI->dropUnknownNonDebugMetadata(MDs); + + // Restore the loop metadata to the new loop header. L->setLoopID(LoopMD); + } - return MadeChange; + // Add new metadata that loop has been rotated once. + // The metadata reflects how many times loop has been rotated. + addStringMetadataToLoop(L, LoopRotatedTimes, Times + 1); + return true; } LoopRotatePass::LoopRotatePass() {} @@ -632,9 +665,9 @@ // Optional analyses. auto *DT = FAM.getCachedResult(*F); auto *SE = FAM.getCachedResult(*F); - LoopRotate LR(DefaultRotationThreshold, LI, TTI, AC, DT, SE); + LoopRotate LR(LI, TTI, AC, DT, SE, &L); - bool Changed = LR.processLoop(&L); + bool Changed = LR.processLoop(); if (!Changed) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); @@ -643,22 +676,17 @@ namespace { class LoopRotateLegacyPass : public LoopPass { - unsigned MaxHeaderSize; - public: static char ID; // Pass ID, replacement for typeid - LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { + LoopRotateLegacyPass() : LoopPass(ID) { initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry()); - if (SpecifiedMaxHeaderSize == -1) - MaxHeaderSize = DefaultRotationThreshold; - else - MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); } // LCSSA form makes instruction renaming easier. void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); getLoopAnalysisUsage(AU); } @@ -674,8 +702,8 @@ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *SEWP = getAnalysisIfAvailable(); auto *SE = SEWP ? &SEWP->getSE() : nullptr; - LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE); - return LR.processLoop(L); + LoopRotate LR(LI, TTI, AC, DT, SE, L); + return LR.processLoop(); } }; } @@ -685,10 +713,9 @@ false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false, false) -Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { - return new LoopRotateLegacyPass(MaxHeaderSize); -} +Pass *llvm::createLoopRotatePass() { return new LoopRotateLegacyPass(); } Index: llvm/test/Analysis/GlobalsModRef/memset-escape.ll =================================================================== --- llvm/test/Analysis/GlobalsModRef/memset-escape.ll +++ llvm/test/Analysis/GlobalsModRef/memset-escape.ll @@ -6,15 +6,19 @@ @a = internal global [3 x i32] zeroinitializer, align 4 @b = common global i32 0, align 4 -; The important thing we're checking for here is the reload of (some element of) -; @a after the memset. +; Check that load and the call to abort is redundant. +; CHECK: store i32 1, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4 +; CHECK: store i32 0, i32* @b, align 4 +; CHECK: br label %for.body -; CHECK-LABEL: @main -; CHECK: call void @llvm.memset.p0i8.i64{{.*}} @a -; CHECK: store i32 3 -; CHECK: load i32, i32* getelementptr {{.*}} @a -; CHECK: icmp eq i32 -; CHECK: br i1 +; CHECK: for.body: ; preds = %for.body.preheader +; CHECK: store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 0), align 4 +; CHECK: store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4 +; CHECK: store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4 +; CHECK: store i32 3, i32* @b, align 4 +; CHECK: br i1 true, label %if.end, label %if.then +; CHECK-NOT: load +; CHECK-NOT: call void @abort() define i32 @main() { entry: Index: llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll =================================================================== --- llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll +++ llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -globalopt -instcombine -loop-rotate -licm -instcombine -indvars -loop-deletion -constmerge -S | FileCheck %s +; RUN: opt < %s -basicaa -globalopt -instcombine -loop-rotate -licm -simplifycfg -S | FileCheck %s + ; PR11882: ComputeLoadConstantCompareExitLimit crash. ; ; for.body is deleted leaving a loop-invariant load. Index: llvm/test/Transforms/LoopRotate/alloca.ll =================================================================== --- llvm/test/Transforms/LoopRotate/alloca.ll +++ llvm/test/Transforms/LoopRotate/alloca.ll @@ -5,8 +5,9 @@ ; We expect a different value for %ptr each iteration (according to the ; definition of alloca). I.e. each @use must be paired with an alloca. -; CHECK: call void @use(i8* % -; CHECK: %ptr = alloca i8 +; CHECK: alloca i8 +; CHECK: call void @use +; CHECK: alloca i8 @e = global i16 10 Index: llvm/test/Transforms/LoopRotate/basic.ll =================================================================== --- llvm/test/Transforms/LoopRotate/basic.ll +++ llvm/test/Transforms/LoopRotate/basic.ll @@ -18,9 +18,10 @@ %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 br i1 %cmp, label %for.body, label %for.end -; CHECK: for.body: +; CHECK: for.cond.lr: ; CHECK-NEXT: phi i32 [ 0 -; CHECK-NEXT: store i32 0 +; CHECK: for.body: +; CHECK: store i32 0 for.body: ; preds = %for.cond store i32 0, i32* %arrayidx, align 16 Index: llvm/test/Transforms/LoopRotate/dbgvalue.ll =================================================================== --- llvm/test/Transforms/LoopRotate/dbgvalue.ll +++ llvm/test/Transforms/LoopRotate/dbgvalue.ll @@ -5,9 +5,9 @@ define i32 @tak(i32 %x, i32 %y, i32 %z) nounwind ssp !dbg !0 { ; CHECK-LABEL: define i32 @tak( -; CHECK: entry -; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %x -; CHECK: tail call void @llvm.dbg.value(metadata i32 %call +; CHECK: tailrecurse.lr +; CHECK: call void @llvm.dbg.value(metadata i32 %x +; CHECK: tail call void @llvm.dbg.value(metadata i32 %y entry: br label %tailrecurse @@ -42,7 +42,7 @@ ; CHECK-LABEL: define i32 @tak2( ; CHECK: entry ; CHECK: tail call void @llvm.dbg.value(metadata i32 %x.tr -; CHECK: tail call void @llvm.dbg.value(metadata i32 undef +; CHECK: tail call void @llvm.dbg.value(metadata i32 %y.tr entry: br label %tailrecurse @@ -83,12 +83,13 @@ ; Ensure that the loop increment basic block is rotated into the tail of the ; body, even though it contains a debug intrinsic call. ; CHECK-LABEL: define void @FindFreeHorzSeg( +; CHECK: for.inc: +; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ] ; CHECK: %dec = add ; CHECK-NEXT: tail call void @llvm.dbg.value +; CHECK-NEXT: br label %for.cond ; CHECK: %cmp = icmp ; CHECK: br i1 %cmp -; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ] -; CHECK-NEXT: br label %for.end entry: Index: llvm/test/Transforms/LoopRotate/indirectbr-1.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopRotate/indirectbr-1.ll @@ -0,0 +1,77 @@ +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck %s +; ModuleID = 'ind.c' +source_filename = "ind.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +; Check that the loop-latch with indirect branch is not rotated. +; CHECK-NOT: {{.*}}.lr +; CHECK-NOT: {{.*}}.lr.ph + +@f.x = internal global [3 x i8*] [i8* blockaddress(@f, %F), i8* blockaddress(@f, %G), i8* blockaddress(@f, %H)], align 16 + +; Function Attrs: nounwind uwtable +define i32 @f(i32 %i, i32 %j, i32 %k) { +entry: + %retval = alloca i32, align 4 + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + %k.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + store i32 %k, i32* %k.addr, align 4 + br label %F + +F: ; preds = %entry, %indirectgoto + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %F + br label %G + +if.else: ; preds = %F + %2 = load i32, i32* %i.addr, align 4 + %3 = load i32, i32* %k.addr, align 4 + %cmp1 = icmp sgt i32 %2, %3 + br i1 %cmp1, label %if.then2, label %if.end + +if.then2: ; preds = %if.else + br label %H + +if.end: ; preds = %if.else + br label %if.end3 + +if.end3: ; preds = %if.end + %call = call i32 (...) @z() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds [3 x i8*], [3 x i8*]* @f.x, i64 0, i64 %idxprom + %4 = load i8*, i8** %arrayidx, align 8 + br label %indirectgoto + +G: ; preds = %if.then, %indirectgoto + %call4 = call i32 (...) @g() + store i32 %call4, i32* %retval, align 4 + br label %return + +H: ; preds = %if.then2, %indirectgoto + %call5 = call i32 (...) @h() + store i32 %call5, i32* %retval, align 4 + br label %return + +return: ; preds = %H, %G + %5 = load i32, i32* %retval, align 4 + ret i32 %5 + +indirectgoto: ; preds = %if.end3 + %indirect.goto.dest = phi i8* [ %4, %if.end3 ] + indirectbr i8* %indirect.goto.dest, [label %F, label %G, label %H] +} + +declare i32 @g(...) +declare i32 @h(...) +declare i32 @z(...) + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.9.0 "} Index: llvm/test/Transforms/LoopRotate/loop-rotate-before-latch.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopRotate/loop-rotate-before-latch.ll @@ -0,0 +1,46 @@ +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck %s +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info -loop-rotate | FileCheck %s -check-prefix=CHECK1 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL:foo +; CHECK: "llvm.loop.rotated.times", i32 1 + +; Check that the loop is rotated only once despite multiple invocations of loop-rotate on a loop. +; CHECK1-LABEL:foo +; CHECK1: "llvm.loop.rotated.times", i32 1 + +; Function Attrs: nounwind uwtable +define hidden void @foo(i32 %arg1, i32 %arg3) local_unnamed_addr { + +entry: + br label %bb25 + +bb25: ; preds = %bb140, %entry + br i1 undef, label %bb165, label %bb34 + +bb34: ; preds = %bb25 + switch i32 undef, label %bb35 [ + i32 46, label %bb36 + i32 32, label %bb36 + ] + +bb35: ; preds = %bb34 + unreachable + +bb36: ; preds = %bb34, %bb34 + br label %bb140 + +bb140: ; preds = %bb36 + %tmp145 = add nsw i32 undef, undef + br i1 undef, label %bb25, label %bb146 + +bb146: ; preds = %bb140 + %tmp147 = icmp sgt i32 %tmp145, %arg1 + unreachable + +bb165: ; preds = %bb25 + unreachable +} + Index: llvm/test/Transforms/LoopRotate/loop-rotate.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopRotate/loop-rotate.ll @@ -0,0 +1,415 @@ +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck %s +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK1 %s +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK2 %s +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK3 %s +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK4 %s +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK5 %s +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK6 %s +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK7 %s +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK8 %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Check that the loop is rotated. +; CHECK-LABEL: bar0 +; CHECK: while.cond.lr: ; preds = %entry +; CHECK: while.body.lr.ph: ; preds = %while.cond.lr + +%fp = type { i64 } + +declare void @foo0(%fp* %this, %fp* %that) + +define void @bar0(%fp* %__begin1, %fp* %__end1, %fp** %__end2) { +entry: + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %__end1.addr.0 = phi %fp* [ %__end1, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq %fp* %__end1.addr.0, %__begin1 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %0 = load %fp*, %fp** %__end2, align 8 + %add.ptr = getelementptr inbounds %fp, %fp* %0, i64 -1 + %incdec.ptr = getelementptr inbounds %fp, %fp* %__end1.addr.0, i64 -1 + tail call void @foo0(%fp* %add.ptr, %fp* %incdec.ptr) + %1 = load %fp*, %fp** %__end2, align 8 + %incdec.ptr2 = getelementptr inbounds %fp, %fp* %1, i64 -1 + store %fp* %incdec.ptr2, %fp** %__end2, align 8 + br label %while.cond + +while.end: ; preds = %while.cond + ret void +} + + +; Check that the loop is rotated. +; CHECK1-LABEL: bar1 +; CHECK1: for.cond.lr: ; preds = %entry +; CHECK1: if.end.lr: ; preds = %for.cond.lr +; CHECK1: if.end75.lr: ; preds = %if.end.lr +; CHECK1: if.end94.lr: ; preds = %if.end75.lr +; CHECK1: if.then178.lr.ph: ; preds = %if.end94.lr + +declare i32 @foo1() +define i32 @bar1() { +entry: + br label %for.cond + +for.cond: ; preds = %if.then178, %entry + %first.0 = phi i32 [ 1, %entry ], [ 0, %if.then178 ] + br i1 undef, label %err_sl, label %if.end + +if.end: ; preds = %for.cond + br i1 undef, label %err_sl, label %if.end75 + +if.end75: ; preds = %if.end + br i1 undef, label %if.end94, label %if.then93 + +if.then93: ; preds = %if.end75 + unreachable + +if.end94: ; preds = %if.end75 + br i1 undef, label %if.then178, label %for.end182 + +if.then178: ; preds = %if.end94 + %call179 = tail call i32 @foo1() + br label %for.cond + +for.end182: ; preds = %if.end94 + ret i32 1 + +err_sl: ; preds = %if.end, %for.cond + unreachable +} + + +; Check that the loop is rotated and multiple phis are updated. +; CHECK2-LABEL: foo2 +; CHECK2: do.body.i10.lr: ; preds = %if.else.i3 +; CHECK2: do.cond.i19.lr.ph: ; preds = %do.body.i10.lr + +define void @foo2() { +entry: + br i1 undef, label %for.body, label %for.end + +for.body: ; preds = %entry + br i1 undef, label %inverse.exit21, label %if.else.i3 + +if.else.i3: ; preds = %for.body + br label %do.body.i10 + +do.body.i10: ; preds = %do.cond.i19, %if.else.i3 + %b1.0.i4 = phi i64 [ 0, %if.else.i3 ], [ %b2.0.i7, %do.cond.i19 ] + %b2.0.i7 = phi i64 [ 1, %if.else.i3 ], [ %sub8.i18, %do.cond.i19 ] + br i1 undef, label %do.cond.thread.i15, label %do.cond.i19 + +do.cond.thread.i15: ; preds = %do.body.i10 + br label %inverse.exit21 + +do.cond.i19: ; preds = %do.body.i10 + %mul.i17 = mul nsw i64 undef, %b2.0.i7 + %sub8.i18 = sub nsw i64 %b1.0.i4, %mul.i17 + br label %do.body.i10 + +inverse.exit21: ; preds = %do.cond.thread.i15, %for.body + br label %for.end + +for.end: ; preds = %inverse.exit21, %entry + ret void +} + +; Check that the loop with indirect branches is rotated. +; CHECK3-LABEL: foo3 +; CHECK3: while.cond.lr: ; preds = %entry +; CHECK3: if.end6.lr: ; preds = %while.cond.lr +; CHECK3: if.else.lr: ; preds = %if.end6.lr +; CHECK3: if.end2.i.lr: ; preds = %if.else.lr +; CHECK3: if.end37.lr: ; preds = %if.end2.i.lr +; CHECK3: if.else96.lr: ; preds = %if.end37.lr +; CHECK3: if.else106.lr.ph: ; preds = %if.else96.lr + +define i32 @foo3() { +entry: + br label %while.cond + +while.cond: ; preds = %if.then467, %entry + br i1 undef, label %if.then, label %if.end6 + +if.then: ; preds = %while.cond + unreachable + +if.end6: ; preds = %while.cond + %conv7 = ashr exact i64 undef, 32 + br i1 undef, label %if.else, label %if.then19 + +if.then19: ; preds = %if.end6 + unreachable + +if.else: ; preds = %if.end6 + br i1 undef, label %thr, label %if.end2.i + +if.end2.i: ; preds = %if.else + br i1 undef, label %thr, label %if.end37 + +thr: ; preds = %if.end2.i, %if.else + unreachable + +if.end37: ; preds = %if.end2.i + br i1 undef, label %if.else96, label %if.then40 + +if.then40: ; preds = %if.end37 + unreachable + +if.else96: ; preds = %if.end37 + br i1 undef, label %if.else106, label %if.then99 + +if.then99: ; preds = %if.else96 + unreachable + +if.else106: ; preds = %if.else96 + switch i32 undef, label %if.else427 [ + i32 12, label %if.then130 + i32 30, label %if.then467 + ] + +if.then130: ; preds = %if.else106 + br label %if.then467 + +if.else427: ; preds = %if.else106 + unreachable + +if.then467: ; preds = %if.then130, %if.else106 + br label %while.cond +} + +; Check that the loop is rotated. +; CHECK4-LABEL: foo4 +; CHECK4: bb3.lr: ; preds = %bb +; CHECK4: bb5.lr: ; preds = %bb3.lr +; CHECK4: bb8.lr.ph: ; preds = %bb5.lr + +define void @foo4(i8* %arg, i8* %arg1, i64 %arg2) { +bb: + %tmp = and i32 undef, 7 + br label %bb3 + +bb3: ; preds = %bb8, %bb + %tmp4 = phi i32 [ 0, %bb8 ], [ %tmp, %bb ] + br i1 undef, label %bb9, label %bb5 + +bb5: ; preds = %bb3 + %tmp6 = sub nsw i32 8, %tmp4 + br i1 undef, label %bb7, label %bb8 + +bb7: ; preds = %bb5 + unreachable + +bb8: ; preds = %bb5 + store i32 undef, i32* undef, align 8 + br label %bb3 + +bb9: ; preds = %bb3 + ret void +} + + +; Check that the loop with pre-header and loop-body having a switch block is rotated. +; CHECK5-LABEL: bar5 +; CHECK5: bb19.preheader: ; preds = %bb10, %bb10 +; CHECK5: bb19.lr: ; preds = %bb19.preheader +; CHECK5: bb12.lr.ph: ; preds = %bb19.lr + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare i32 @foo5(i8*, i32) + +define i32 @bar5(i8* %arg, i32 %arg1, i8* %arg2, i32 %arg3) { +bb6: + br i1 undef, label %bb8, label %bb10 + +bb8: ; preds = %bb6 + unreachable + +bb10: ; preds = %bb6 + switch i32 undef, label %bb11 [ + i32 46, label %bb19 + i32 32, label %bb19 + ] + +bb11: ; preds = %bb10 + unreachable + +bb12: ; preds = %bb19 + switch i8 undef, label %bb13 [ + i8 32, label %bb21 + i8 46, label %bb21 + ] + +bb13: ; preds = %bb12 + br label %bb15 + +bb15: ; preds = %bb13 + br i1 undef, label %bb17, label %bb16 + +bb16: ; preds = %bb15 + %tmp = call i32 @foo5(i8* %tmp20, i32 10) + ret i32 0 + +bb17: ; preds = %bb15 + %tmp18 = phi i8* [ %tmp20, %bb15 ] + br label %bb19 + +bb19: ; preds = %bb17, %bb10, %bb10 + %tmp20 = phi i8* [ %tmp18, %bb17 ], [ null, %bb10 ], [ null, %bb10 ] + br i1 undef, label %bb21, label %bb12 + +bb21: ; preds = %bb19, %bb12, %bb12 + unreachable +} + + +; Check that the loop is rotated. +; CHECK6-LABEL: bar6 +; CHECK6: while.cond.preheader: ; preds = %if.end92 +; CHECK6: while.cond.lr: ; preds = %while.cond.preheader +; CHECK6: if.end107.lr.ph: ; preds = %while.cond.lr + +; Function Attrs: nounwind uwtable +define i32 @bar6() { +entry: + br i1 undef, label %if.then, label %if.end + +if.then: ; preds = %entry + ret i32 -1 + +if.end: ; preds = %entry + br i1 undef, label %if.then39, label %if.end48 + +if.then39: ; preds = %if.end + unreachable + +if.end48: ; preds = %if.end + br i1 undef, label %if.else61, label %if.then59 + +if.then59: ; preds = %if.end48 + unreachable + +if.else61: ; preds = %if.end48 + br i1 undef, label %if.then91, label %if.end92 + +if.then91: ; preds = %if.else61 + unreachable + +if.end92: ; preds = %if.else61 + br i1 undef, label %if.then96, label %while.cond + +if.then96: ; preds = %if.end92 + unreachable + +while.cond: ; preds = %if.end107, %if.end92 + %len.3 = phi i64 [ %add109, %if.end107 ], [ 0, %if.end92 ] + br i1 undef, label %if.then106, label %if.end107 + +if.then106: ; preds = %while.cond + unreachable + +if.end107: ; preds = %while.cond + %add109 = add i64 undef, %len.3 + %sub111 = sub i64 undef, undef + br label %while.cond +} + + +declare i8* @foo7() + +; Check that the loop is rotated. +; CHECK7-LABEL: bar7 +; CHECK7: for.cond.preheader: ; preds = %if.end +; CHECK7: for.cond.lr: ; preds = %for.cond.preheader +; CHECK7: for.body.lr: ; preds = %for.cond.lr +; CHECK7: if.end20.lr.ph: ; preds = %for.body.lr + +define void @bar7() #1 { +entry: + br i1 undef, label %if.then, label %if.end + +if.then: ; preds = %entry + unreachable + +if.end: ; preds = %entry + br i1 undef, label %return, label %for.cond + +for.cond: ; preds = %for.inc, %if.end + br i1 undef, label %for.body, label %return + +for.body: ; preds = %for.cond + %call15 = call i8* @foo7() + br i1 undef, label %if.end20, label %return + +if.end20: ; preds = %for.body + %issuer.i = getelementptr inbounds i8, i8* %call15, i64 24 + br i1 undef, label %if.then.i, label %for.cond.i + +if.then.i: ; preds = %if.end20 + br i1 undef, label %if.then23, label %crl + +for.cond.i: ; preds = %if.end20 + br i1 undef, label %for.body.i, label %for.inc + +for.body.i: ; preds = %for.cond.i + unreachable + +crl: ; preds = %if.then.i + br i1 undef, label %if.then23, label %for.inc + +if.then23: ; preds = %crl, %if.then.i + %reason = getelementptr inbounds i8, i8* %call15, i64 32 + br label %return + +for.inc: ; preds = %crl, %for.cond.i + br label %for.cond + +return: ; preds = %if.then23, %for.body, %for.cond, %if.end + ret void +} + + +; Check that the loop with an if-block within the loop is rotated. +; There are multiple exits from the loop. + +; CHECK8-LABEL: foo8 +; CHECK8: for.cond.lr: ; preds = %entry +; CHECK8: %first.0.lr = phi i32 [ 1, %entry ] +; CHECK8: if.end.lr: ; preds = %for.cond.lr +; CHECK8: if.end75.lr: ; preds = %if.end.lr +; CHECK8: if.end94.lr.ph: ; preds = %if.end75.lr + +define i32 @foo8() { +entry: + br label %for.cond + +for.cond: ; preds = %if.end94, %entry + %first.0 = phi i32 [ 1, %entry ], [ 0, %if.end94 ] + br i1 undef, label %err_sl, label %if.end + +if.end: ; preds = %for.cond + br i1 undef, label %err_sl, label %if.end75 + +if.end75: ; preds = %if.end + br i1 undef, label %if.end94, label %if.then93 + +if.then93: ; preds = %if.end75 + ret i32 0 + +if.end94: ; preds = %if.end75 + %call179 = tail call i32 @gets() + br label %for.cond + +err_sl: ; preds = %if.end, %for.cond + unreachable +} + +declare i32 @gets() Index: llvm/test/Transforms/LoopRotate/multiple-exits-merge-phi.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopRotate/multiple-exits-merge-phi.ll @@ -0,0 +1,40 @@ +; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s +; ModuleID = 'bugpoint-reduced-simplified.bc' +source_filename = "bugpoint-output-fdebd3b.bc" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Function Attrs: nounwind readonly +define void @test1() #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.cond1, %entry + %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ] + br i1 undef, label %for.cond1, label %return + +for.cond1: ; preds = %land.rhs, %for.cond + %sum.1 = phi i32 [ 0, %land.rhs ], [ %sum.0, %for.cond ] + br i1 undef, label %land.rhs, label %for.cond + +land.rhs: ; preds = %for.cond1 + br i1 undef, label %return, label %for.cond1 + +return: ; preds = %land.rhs, %for.cond + %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ] + ret void +} + +attributes #0 = { nounwind readonly } + +; check that loop with multiple exit is rotated and has a preheader. +; CHECK-LABEL: test1 +; CHECK: entry: +; CHECK: br label %for.cond.lr +; CHECK: for.cond.loopexit: +; CHECK: br label %for.cond +; CHECK: for.cond.lr: +; CHECK: br i1 false, label %for.cond1.preheader.lr.ph, label %return.loopexit1 +; CHECK: for.cond1.preheader.lr.ph: +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: Index: llvm/test/Transforms/LoopRotate/multiple-exits.ll =================================================================== --- llvm/test/Transforms/LoopRotate/multiple-exits.ll +++ llvm/test/Transforms/LoopRotate/multiple-exits.ll @@ -3,7 +3,18 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -; PR7447 +; PR7447: there should be two loops rotated. +; CHECK-LABEL: @test1 + +; Check that the outer loop is rotated. +; CHECK: for.cond.lr: +; CHECK: for.cond1.preheader.lr.ph: + +; Check that the inner loop is rotated. +; CHECK: for.cond1.lr: +; CHECK: land.rhs.lr: +; CHECK: land.rhs.for.cond1_crit_edge.lr.ph: + define i32 @test1([100 x i32]* nocapture %a) nounwind readonly { entry: br label %for.cond @@ -31,17 +42,6 @@ return: ; preds = %for.cond, %land.rhs %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ] ret i32 %retval.0 - -; CHECK-LABEL: @test1( -; CHECK: for.cond1.preheader: -; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ] -; CHECK: br label %for.cond1 - -; CHECK: for.cond1: -; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ] -; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ] -; CHECK: %cmp2 = icmp ult i32 %i.1, 100 -; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit } define void @test2(i32 %x) nounwind { @@ -75,9 +75,9 @@ ; CHECK-LABEL: @test2( ; CHECK: if.end: -; CHECK: %inc = add i32 %i.02, 1 -; CHECK: %cmp = icmp eq i32 %inc, %x -; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body +; CHECK: %inc = add i32 %phi.nh, 1 +; CHECK: %cmp = icmp eq i32 %i.0, %x +; CHECK: br i1 %cmp, label %return.loopexit, label %for.body } declare i32 @foo(i32) Index: llvm/test/Transforms/LoopRotate/nosimplifylatch.ll =================================================================== --- llvm/test/Transforms/LoopRotate/nosimplifylatch.ll +++ llvm/test/Transforms/LoopRotate/nosimplifylatch.ll @@ -3,7 +3,8 @@ target triple = "arm64-apple-ios8.0.0" ;CHECK: for.inc: -;CHECK-NEXT: %incdec.ptr.i = getelementptr +;CHECK: %incdec.ptr.i = getelementptr +;CHECK: br ; Function Attrs: alwaysinline inlinehint nounwind readonly ssp define linkonce_odr hidden i64 @_ZNSt3__14findINS_11__wrap_iterIPiEEiEET_S4_S4_RKT0_(i64 %__first.coerce, i64 %__last.coerce, i32* nocapture readonly dereferenceable(4) %__value_) { Index: llvm/test/Transforms/LoopRotate/phi-duplicate.ll =================================================================== --- llvm/test/Transforms/LoopRotate/phi-duplicate.ll +++ llvm/test/Transforms/LoopRotate/phi-duplicate.ll @@ -31,10 +31,15 @@ ; Should only end up with one phi. ; CHECK-LABEL: define void @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label %for.body +; CHECK-NEXT: br label %for.cond.lr ; CHECK: for.body: -; CHECK-NEXT: %j.01 = phi i64 -; CHECK-NOT: br -; CHECK: br i1 %cmp, label %for.body, label %for.end -; CHECK: for.end: -; CHECK-NEXT: ret void +; CHECK: %phi.nh3 = phi i64 +; CHECK: %phi.nh2 = phi i64 +; CHECK: %phi.nh1 = phi i64 +; CHECK: %phi.nh = phi i64 +; CHECK: %inc = add nsw i64 %phi.nh, 1 +; CHECK: br label %for.cond +; CHECK: for.cond: +; CHECK: br i1 %cmp +; CHECK: for.end: +; CHECK-NEXT: ret void Index: llvm/test/Transforms/LoopRotate/pr7447.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopRotate/pr7447.ll @@ -0,0 +1,34 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-rotate -stats 2>&1 | FileCheck %s + +; PR7447: there should be two loops rotated. +; CHECK: 2 loop-rotate + +define i32 @test1([100 x i32]* nocapture %a) nounwind readonly { +entry: + br label %for.cond + +for.cond: ; preds = %for.cond1, %entry + %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ] + %i.0 = phi i1 [ true, %entry ], [ false, %for.cond1 ] + br i1 %i.0, label %for.cond1, label %return + +for.cond1: ; preds = %for.cond, %land.rhs + %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.0, %for.cond ] + %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond ] + %cmp2 = icmp ult i32 %i.1, 100 + br i1 %cmp2, label %land.rhs, label %for.cond + +land.rhs: ; preds = %for.cond1 + %conv = zext i32 %i.1 to i64 + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %a, i64 0, i64 %conv + %0 = load i32, i32* %arrayidx, align 4 + %add = add i32 %0, %sum.1 + %cmp4 = icmp ugt i32 %add, 1000 + %inc = add i32 %i.1, 1 + br i1 %cmp4, label %return, label %for.cond1 + +return: ; preds = %for.cond, %land.rhs + %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ] + ret i32 %retval.0 +} Index: llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll =================================================================== --- llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll +++ llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll @@ -16,22 +16,28 @@ ; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER:[^,]*]], label %outer.body ; CHECK: [[INNER_PREROTATE_PREHEADER]]: -; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER_SPLIT_RETURN:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]] +; CHECK: br label %inner.header.lr + +; CHECK: inner.header.lr: +; CHECK: br i1 true, label %return, label %inner.body.lr + +; CHECK: inner.body.lr: +; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]] ; CHECK: [[INNER_ROTATED_PREHEADER]]: -; CHECK-NEXT: br label %inner.body +; CHECK-NEXT: br label %inner.latch inner.header: ; Now the latch! ; CHECK: inner.header: br i1 undef, label %return, label %inner.body -; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_RETURN:[^,]*]], label %inner.body +; CHECK-NEXT: br i1 {{[^,]*}}, label %return, label %inner.body inner.body: ; Now the header! ; CHECK: inner.body: br i1 undef, label %outer.latch, label %inner.latch -; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_OUTER_LATCH:[^,]*]], label %inner.header +; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT]], label %inner.latch inner.latch: ; Dead! @@ -42,8 +48,7 @@ br label %outer.latch ; CHECK-NEXT: br label %outer.latch -; L2 -> L1 exit edge needs a simplified exit block. -; CHECK: [[INNER_SPLIT_OUTER_LATCH]]: +; CHECK: [[OUTER_LATCH_LOOPEXIT]]: ; CHECK-NEXT: br label %outer.latch outer.latch: @@ -51,14 +56,6 @@ br label %outer.header ; CHECK-NEXT: br label %outer.header -; L1 -> L0 exit edge need sa simplified exit block. -; CHECK: [[INNER_PREROTATE_PREHEADER_SPLIT_RETURN]]: -; CHECK-NEXT: br label %return - -; L2 -> L0 exit edge needs a simplified exit block. -; CHECK: [[INNER_SPLIT_RETURN]]: -; CHECK-NEXT: br label %return - return: ; CHECK: return: unreachable Index: llvm/test/Transforms/LoopRotate/simplifylatch.ll =================================================================== --- llvm/test/Transforms/LoopRotate/simplifylatch.ll +++ llvm/test/Transforms/LoopRotate/simplifylatch.ll @@ -3,8 +3,13 @@ @mode_table = global [4 x i32] zeroinitializer ; <[4 x i32]*> [#uses=1] +; Check that loop with multiple exits is rotated. ; CHECK-LABEL: @f( -; CHECK-NOT: bb: +; CHECK: bb.lr: +; CHECK: bb2.lr: +; CHECK: bb4.lr.ph: +; CHECK: bb: + define i8 @f() { entry: tail call i32 @fegetround( ) ; :0 [#uses=1] @@ -38,11 +43,21 @@ declare void @raise_exception() noreturn -;CHECK: for.body.lr.ph: -;CHECK-NEXT: %arrayidx1 = getelementptr inbounds i8, i8* %CurPtr, i64 0 -;CHECK-NEXT: %0 = load i8, i8* %arrayidx1, align 1 -;CHECK-NEXT: %conv2 = sext i8 %0 to i32 -;CHECK-NEXT: br label %for.body +;CHECK-LABEL: @foo( +;CHECK: for.body.lr: +;CHECK: %arrayidx.lr = getelementptr inbounds i8, i8* %CurPtr, i64 %idxprom.lr +;CHECK: %0 = load i8, i8* %arrayidx.lr, align 1 +;CHECK: %conv.lr = sext i8 %0 to i32 +;CHECK: %arrayidx1.lr = getelementptr inbounds i8, i8* %CurPtr, i64 0 +;CHECK: %1 = load i8, i8* %arrayidx1.lr, align 1 +;CHECK: %conv2.lr = sext i8 %1 to i32 +;CHECK: for.body: +;CHECK: %arrayidx = getelementptr inbounds i8, i8* %CurPtr, i64 %idxprom +;CHECK: %2 = load i8, i8* %arrayidx, align 1 +;CHECK: %conv = sext i8 %2 to i32 +;CHECK: %3 = load i8, i8* %arrayidx1, align 1 +;CHECK: %conv2 = sext i8 %3 to i32 +;CHECK: br i1 %cmp3, label %return, label %for.inc define i32 @foo(i8* %CurPtr, i32 %a) #0 { entry: Index: llvm/test/Transforms/LoopSimplify/ashr-crash.ll =================================================================== --- llvm/test/Transforms/LoopSimplify/ashr-crash.ll +++ llvm/test/Transforms/LoopSimplify/ashr-crash.ll @@ -29,9 +29,9 @@ ; CHECK-LABEL: entry: ; CHECK-LABEL: for.cond1.preheader: ; CHECK-LABEL: for.body3: -; CHECK: %cmp4.le.le -; CHECK: %conv.le.le = zext i1 %cmp4.le.le to i32 -; CHECK: %xor.le.le = xor i32 %conv6.le.le, 1 +; CHECK: %cmp4 +; CHECK: %conv = zext i1 %cmp4 to i32 +; CHECK: %xor = xor i32 %conv6, 1 define void @foo() { entry: br label %for.cond Index: llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll +++ llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll @@ -4,11 +4,11 @@ ; Loop from "rotated" ; CHECK: LV: Loop hints: force=enabled ; Loop from "nonrotated" -; CHECK: LV: Loop hints: force=enabled -; No more loops in the module ; CHECK-NOT: LV: Loop hints: force= -; In total only 1 loop should be rotated. -; CHECK: 1 loop-rotate +; checking the stats for the function @nonrotated +; CHECK: 2 gvn{{.*}}Number of equalities propagated +; With the new loop rotation both the loops should be rotated. +; CHECK: 2 loop-rotate target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu"