Index: llvm/include/llvm/Analysis/LoopInfo.h =================================================================== --- llvm/include/llvm/Analysis/LoopInfo.h +++ llvm/include/llvm/Analysis/LoopInfo.h @@ -456,13 +456,19 @@ /// iterations. bool isAnnotatedParallel() const; + /// Return the llvm.loop loop id metadata node for this loop and the + /// instruction which contains the metadata. If there is no such instruction + /// having a loop metadata then return a pair of nullptrs. + std::pair getLoopIDWithInstr() const; + /// Return the llvm.loop loop id metadata node for this loop if it is present. /// /// If this loop contains the same llvm.loop metadata on each branch to the /// header then the node is returned. If any latch instruction does not /// contain llvm.loop or or if multiple latches contain different nodes then /// 0 is returned. - MDNode *getLoopID() const; + MDNode *getLoopID() const { return getLoopIDWithInstr().first; } + /// Set the llvm.loop loop id metadata for this loop. /// /// The LoopID metadata node will be added to each terminator instruction in Index: llvm/include/llvm/IR/Instruction.h =================================================================== --- llvm/include/llvm/IR/Instruction.h +++ llvm/include/llvm/IR/Instruction.h @@ -201,6 +201,10 @@ getAllMetadataOtherThanDebugLocImpl(MDs); } + /// Get all the metadata IDs attached to this Instruction except the ones + /// associated with debug location. The Vector \p MDIDs is not cleared. + void getAllNonDebugMetadataIDs(SmallVectorImpl &MDIDs) const; + /// Fills the AAMDNodes structure with AA metadata from this instruction. /// When Merge is true, the existing AA metadata is merged with that from this /// instruction providing the most-general result. Index: llvm/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/include/llvm/Transforms/Scalar.h +++ llvm/include/llvm/Transforms/Scalar.h @@ -197,7 +197,7 @@ // // LoopRotate - This pass is a simple loop rotating pass. // -Pass *createLoopRotatePass(int MaxHeaderSize = -1); +Pass *createLoopRotatePass(); //===----------------------------------------------------------------------===// // Index: llvm/lib/Analysis/LoopInfo.cpp =================================================================== --- llvm/lib/Analysis/LoopInfo.cpp +++ llvm/lib/Analysis/LoopInfo.cpp @@ -209,10 +209,12 @@ return true; } -MDNode *Loop::getLoopID() const { +std::pair Loop::getLoopIDWithInstr() const { MDNode *LoopID = nullptr; + Instruction *I = nullptr; if (BasicBlock *Latch = getLoopLatch()) { - LoopID = Latch->getTerminator()->getMetadata(LLVMContext::MD_loop); + I = Latch->getTerminator(); + LoopID = I->getMetadata(LLVMContext::MD_loop); } else { // Go through each predecessor of the loop header and check the // terminator for the metadata. @@ -222,17 +224,18 @@ TerminatorInst *TI = BB->getTerminator(); if (MDNode *MD = TI->getMetadata(LLVMContext::MD_loop)) { - if (!LoopID) + if (!LoopID) { LoopID = MD; - else if (MD != LoopID) // Multiple MD_loop found => corrupt metadata. - return nullptr; + I = TI; + } else // Multiple MD_loop found. + return { nullptr, nullptr }; } } } if (!LoopID || LoopID->getNumOperands() == 0 || LoopID->getOperand(0) != LoopID) - return nullptr; - return LoopID; + return { nullptr, nullptr }; + return { LoopID, I }; } void Loop::setLoopID(MDNode *LoopID) const { Index: llvm/lib/IR/LLVMContextImpl.h =================================================================== --- llvm/lib/IR/LLVMContextImpl.h +++ llvm/lib/IR/LLVMContextImpl.h @@ -1050,6 +1050,9 @@ /// ID. This function does \em not clear \c Result. void getAll(SmallVectorImpl> &Result) const; + /// \brief Appends the IDs of all current attachments into \c Result. + void getAllIDs(SmallVectorImpl &Result) const; + /// \brief Erase matching attachments. /// /// Erases all attachments matching the \c shouldRemove predicate. Index: llvm/lib/IR/Metadata.cpp =================================================================== --- llvm/lib/IR/Metadata.cpp +++ llvm/lib/IR/Metadata.cpp @@ -1135,6 +1135,11 @@ array_pod_sort(Result.begin(), Result.end()); } +void MDAttachmentMap::getAllIDs(SmallVectorImpl &Result) const { + for (auto &A : Attachments) + Result.push_back(A.first); +} + void MDGlobalAttachmentMap::insert(unsigned ID, MDNode &MD) { Attachments.push_back({ID, TrackingMDNodeRef(&MD)}); } @@ -1298,6 +1303,15 @@ Info.getAll(Result); } +void +Instruction::getAllNonDebugMetadataIDs(SmallVectorImpl &MDIDs) const { + if (!hasMetadataOtherThanDebugLoc()) + return; + const auto &Info = getContext().pImpl->InstructionMetadata.find(this)->second; + assert(!Info.empty() && "Shouldn't have called this"); + Info.getAllIDs(MDIDs); +} + bool Instruction::extractProfMetadata(uint64_t &TrueVal, uint64_t &FalseVal) const { assert( Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -304,8 +304,10 @@ MPM.add(createTailCallEliminationPass()); // Eliminate tail calls MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions - // Rotate Loop - disable header duplication at -Oz - MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + + // Rotate Loop - disable loop rotation at -Oz + if (SizeLevel != 2) + MPM.add(createLoopRotatePass()); MPM.add(createLICMPass()); // Hoist loop invariants MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); MPM.add(createCFGSimplificationPass()); @@ -556,7 +558,8 @@ // Re-rotate loops in all our loop nests. These may have fallout out of // rotated form due to GVN or other transformations, and the vectorizer relies // on the rotated form. Disable header duplication at -Oz. - MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + if (SizeLevel != 2) + MPM.add(createLoopRotatePass()); // Distribute loops to allow partial vectorization. I.e. isolate dependences // into separate loop that would otherwise inhibit vectorization. This is Index: llvm/lib/Transforms/Scalar/LoopRotation.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -8,7 +8,23 @@ //===----------------------------------------------------------------------===// // // This file implements Loop Rotation Pass. +// All the basic blocks which are between the loop-header and the basic blocks +// exiting the loop are copied during rotation. This is helpful for passes like +// gvn/licm which can now remove loop invariant code because some computations +// (which might not dominate all the basic blocks of loop) can now be PRE'd as +// they will be available outside of the loop (due to loop rotation). // +// Only the loops satisfying following properties are rotated: +// - The loop is an SEME (Single Entry Multiple Exit) region. +// - The loop has a header and its terminator conditionally exits the loop. +// - The loop has a latch and its terminator does not exit the loop. +// - The loop has preheader. +// +// Post conditions: +// The dominator tree is updated during loop rotation. +// A loop already in loop-simplified form remains in loop-simplified form. +// LoopInfo is preserved. +// The rotated loop and the cloned region are SEME regions. //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopRotation.h" @@ -29,597 +45,660 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" + +#include + using namespace llvm; #define DEBUG_TYPE "loop-rotate" -static cl::opt DefaultRotationThreshold( - "rotation-max-header-size", cl::init(16), cl::Hidden, - cl::desc("The default maximum header size for automatic loop rotation")); +static cl::opt RotationMaxSize( + "rotation-max-size", cl::init(100), cl::Hidden, + cl::desc("The default maximum loop size for automatic loop rotation")); + +static cl::opt MaxExits("lr-max-exits", cl::init(10), cl::Hidden, + cl::desc("The maximum exits to be cloned")); STATISTIC(NumRotated, "Number of loops rotated"); -namespace { +// Insert \p NewBB in between \p PredBefore and \p Succ, and redirect edges +// accordingly. PredBefore -> NewBB -> Succ. Also move NewBB before Succ. +static void insertBetween(BasicBlock *NewBB, BasicBlock *PredBefore, + BasicBlock *Succ) { + BranchInst *NewBI = BranchInst::Create(Succ, NewBB); + NewBI->setDebugLoc(PredBefore->getTerminator()->getDebugLoc()); + + BranchInst *BLI = dyn_cast(PredBefore->getTerminator()); + for (unsigned I = 0, E = BLI->getNumSuccessors(); I < E; ++I) + if (BLI->getSuccessor(I) == Succ) { + BLI->setSuccessor(I, NewBB); + break; + } + // Move NewBB physically from the end of the block list. + Function *F = Succ->getParent(); + F->getBasicBlockList().splice(Succ->getIterator(), F->getBasicBlockList(), + NewBB); +} + +// Remove the arguments of all phi nodes in PhiBB coming from block From. +static void discardIncomingValues(BasicBlock *PhiBB, BasicBlock *From) { + for (Instruction &I : *PhiBB) { + PHINode *PN = dyn_cast(&I); + if (!PN) + break; + PN->removeIncomingValue(PN->getBasicBlockIndex(From)); + } +} + +/// Returns true of the region formed by [Entry, Exit] is a +/// single-entry-multiple-exit (SEME) region. All the traces from \p Entry +/// which leads to the \p Exit are analyzed. +static bool isSingleEntryMultipleExit(const BasicBlock *Entry, + const BasicBlock *Exit, + DominatorTree *DT) { + if (!DT->dominates(Entry, Exit)) + return false; + + for (auto I = idf_begin(Exit), E = idf_end(Exit); I != E;) { + if (*I == Entry) { + I.skipChildren(); + continue; + } + + if (!DT->dominates(Entry, *I)) + return false; + + ++I; + } + return true; +} + /// A simple loop rotation transformation. class LoopRotate { - const unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; AssumptionCache *AC; DominatorTree *DT; ScalarEvolution *SE; + Loop *L; + typedef SmallVectorImpl SmallVecBB; + typedef SmallPtrSetImpl SmallPtrSetBB; public: - LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - DominatorTree *DT, ScalarEvolution *SE) - : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) { - } - bool processLoop(Loop *L); + LoopRotate(LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE, Loop *L) + : LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), L(L) {} + bool processLoop(); private: - bool rotateLoop(Loop *L, bool SimplifiedLatch); - bool simplifyLoopLatch(Loop *L); -}; -} // end anonymous namespace - -/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the -/// old header into the preheader. If there were uses of the values produced by -/// these instruction that were outside of the loop, we have to insert PHI nodes -/// to merge the two values. Do this now. -static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, - BasicBlock *OrigPreheader, - ValueToValueMapTy &ValueMap) { - // Remove PHI node entries that are no longer live. - BasicBlock::iterator I, E = OrigHeader->end(); - for (I = OrigHeader->begin(); PHINode *PN = dyn_cast(I); ++I) - PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); - - // Now fix up users of the instructions in OrigHeader, inserting PHI nodes - // as necessary. - SSAUpdater SSA; - for (I = OrigHeader->begin(); I != E; ++I) { - Value *OrigHeaderVal = &*I; - - // If there are no uses of the value (e.g. because it returns void), there - // is nothing to rewrite. - if (OrigHeaderVal->use_empty()) - continue; + void rotateLoop(BasicBlock *NewH, const SmallVecBB &Blocks, + const SmallPtrSetBB &Exits) const; - Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal); - - // The value now exits in two versions: the initial value in the preheader - // and the loop "next" value in the original header. - SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); - SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); - SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); - - // Visit each use of the OrigHeader instruction. - for (Value::use_iterator UI = OrigHeaderVal->use_begin(), - UE = OrigHeaderVal->use_end(); - UI != UE;) { - // Grab the use before incrementing the iterator. - Use &U = *UI; - - // Increment the iterator before removing the use from the list. - ++UI; - - // SSAUpdater can't handle a non-PHI use in the same block as an - // earlier def. We can easily handle those cases manually. - Instruction *UserInst = cast(U.getUser()); - if (!isa(UserInst)) { - BasicBlock *UserBB = UserInst->getParent(); - - // The original users in the OrigHeader are already using the - // original definitions. - if (UserBB == OrigHeader) - continue; + bool preserveLoopSimplifyForm(const SmallPtrSetBB &Exits) const; - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped. - if (UserBB == OrigPreheader) { - U = OrigPreHeaderVal; - continue; - } - } + bool isLoopSizeWithinLimits(const SmallVecBB &Blocks, + const SmallPtrSetBB &Exits) const; - // Anything else can be handled by SSAUpdater. - SSA.RewriteUse(U); - } + bool isLegalToRotate() const; - // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug - // intrinsics. - LLVMContext &C = OrigHeader->getContext(); - if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) { - if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) { - for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) { - // Grab the use before incrementing the iterator. Otherwise, altering - // the Use will invalidate the iterator. - Use &U = *UI++; - DbgInfoIntrinsic *UserInst = dyn_cast(U.getUser()); - if (!UserInst) - continue; + void adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH, + BasicBlock *NewPH) const; - // The original users in the OrigHeader are already using the original - // definitions. - BasicBlock *UserBB = UserInst->getParent(); - if (UserBB == OrigHeader) - continue; + BasicBlock *collectSEMEBlocks(BasicBlock *OrigH, BasicBlock *OrigLatch, + SmallVecBB &Blocks, SmallPtrSetBB &Exits) const; - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped and anything else can be handled by - // the SSAUpdater. To avoid adding PHINodes, check if the value is - // available in UserBB, if not substitute undef. - Value *NewVal; - if (UserBB == OrigPreheader) - NewVal = OrigPreHeaderVal; - else if (SSA.HasValueForBlock(UserBB)) - NewVal = SSA.GetValueInMiddleOfBlock(UserBB); - else - NewVal = UndefValue::get(OrigHeaderVal->getType()); - U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal)); - } - } - } - } -} + PHINode *getOrCreatePHI(Instruction *Inst, BasicBlock *NewHeader, + BasicBlock *NewPreheader, BasicBlock *NewLatch, + ValueToValueMapTy &VMap) const; -/// Rotate loop LP. Return true if the loop is rotated. -/// -/// \param SimplifiedLatch is true if the latch was just folded into the final -/// loop exit. In this case we may want to rotate even though the new latch is -/// now an exiting branch. This rotation would have happened had the latch not -/// been simplified. However, if SimplifiedLatch is false, then we avoid -/// rotating loops in which the latch exits to avoid excessive or endless -/// rotation. LoopRotate should be repeatable and converge to a canonical -/// form. This property is satisfied because simplifying the loop latch can only -/// happen once across multiple invocations of the LoopRotate pass. -bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { + void addNewPhisToNewHeader(const SmallVecBB &Blocks, BasicBlock *NewHeader, + BasicBlock *NewPreheader, BasicBlock *NewLatch, + ValueToValueMapTy &VMap) const; +}; + +// Check if there is something preventing loop from being rotated. +bool LoopRotate::isLegalToRotate() const { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; - BasicBlock *OrigHeader = L->getHeader(); - BasicBlock *OrigLatch = L->getLoopLatch(); + const BasicBlock *LoopLatch = L->getLoopLatch(); + if (!LoopLatch || isa(LoopLatch->getTerminator())) + return false; + + const BasicBlock *OrigH = L->getHeader(); + // The header should have a conditional branch to within the loop for rotation + // to happen. In cases where the header does not conditionally branch to + // within the loop e.g., header with an invoke instruction rotation will fail. + const BranchInst *BI = dyn_cast(OrigH->getTerminator()); + if (!BI || !BI->isConditional()) + return false; - BranchInst *BI = dyn_cast(OrigHeader->getTerminator()); - if (!BI || BI->isUnconditional()) + // If the loop latch is exiting then, probably, this loop is already rotated. + if (L->isLoopExiting(LoopLatch)) return false; - // If the loop header is not one of the loop exiting blocks then - // either this loop is already rotated or it is not - // suitable for loop rotation transformations. - if (!L->isLoopExiting(OrigHeader)) + // If the loop header is not one of the loop exiting blocks then it might + // have been rotated already. + if (!L->isLoopExiting(OrigH)) return false; - // If the loop latch already contains a branch that leaves the loop then the - // loop is already rotated. - if (!OrigLatch) + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) return false; - // Rotate if either the loop latch does *not* exit the loop, or if the loop - // latch was just simplified. - if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) + if (!isSingleEntryMultipleExit(OrigH, LoopLatch, DT)) return false; + return true; +} + +// Check if the size of rotated loop will be within limits. +bool LoopRotate::isLoopSizeWithinLimits(const SmallVecBB &Blocks, + const SmallPtrSetBB &Exits) const { // Check size of original header and reject loop if it is very big or we can't // duplicate blocks inside it. - { - SmallPtrSet EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - + if (Exits.size() > MaxExits) + return false; + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + unsigned LoopSize = 0; + for (const BasicBlock *BB : Blocks) { CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); + Metrics.analyzeBasicBlock(BB, *TTI, EphValues); + // TODO: Modify this because there might be blocks with indirectbr, invoke + // in the loop but we can cut the cloning part at that point and that will + // be the last exit BB. if (Metrics.notDuplicatable) { DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" << " instructions: "; L->dump()); return false; } + if (Metrics.convergent) { DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " "instructions: "; L->dump()); return false; } - if (Metrics.NumInsts > MaxHeaderSize) + + LoopSize += Metrics.NumInsts; + // TODO: Even if the loop's size is greater than RotationMaxSize, what we + // can do is peel fewer basic blocks than needed to completely rotate the + // loop. That way, at least, some redundancies will be exposed. + if (LoopSize >= RotationMaxSize) return false; } + return true; +} - // Now, this loop is suitable for rotation. - BasicBlock *OrigPreheader = L->getLoopPreheader(); +// Return a PHI for incoming values from NewPreHeader and NewLatch. If such a +// PHI already exsits then return that otherwise create one. NewHeader has two +// predecessors NewPreHeader and NewLatch. Inst is the original instruction +// for which the PHI is to be created, its value is coming from NewLatch +// and its cloned value VMap[Inst] is coming from NewPreheader. +PHINode *LoopRotate::getOrCreatePHI(Instruction *Inst, BasicBlock *NewHeader, + BasicBlock *NewPreheader, + BasicBlock *NewLatch, + ValueToValueMapTy &VMap) const { + // Look within existing PHIs having same incoming values. + for (Instruction &I: *NewHeader) { + PHINode *PN = dyn_cast(&I); + if (!PN) + break; + assert(PN->getNumOperands() == 2); + int NLVal = PN->getBasicBlockIndex(NewLatch); + assert(NLVal >= 0); + int NPHVal = PN->getBasicBlockIndex(NewPreheader); + assert(NPHVal >= 0); + if (PN->getIncomingValue(NLVal) == Inst && + PN->getIncomingValue(NPHVal) == VMap[Inst]) + return PN; + } - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!OrigPreheader) - return false; + PHINode *PN = PHINode::Create(Inst->getType(), 2, "phi.nh", + &*NewHeader->begin()); + PN->addIncoming(Inst, NewLatch); + PN->addIncoming(cast(VMap[Inst]), NewPreheader); - // Anything ScalarEvolution may know about this loop or the PHI nodes - // in its header will soon be invalidated. - if (SE) - SE->forgetLoop(L); + return PN; +} - DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); +// Add phis to the new header and adjust the phi nodes from the OrigHeader. +void LoopRotate::addNewPhisToNewHeader(const SmallVecBB &Blocks, + BasicBlock *NewHeader, + BasicBlock *NewPreheader, + BasicBlock *NewLatch, + ValueToValueMapTy &VMap) const { + // Add to NewHeader, phi nodes for all copied variables which are used. + for (BasicBlock *BB : Blocks) { + for (Instruction &Inst : *BB) { + // Skip Inst with no use e.g., branches. + if (Inst.use_begin() == Inst.use_end()) + continue; - // Find new Loop header. NewHeader is a Header's one and only successor - // that is inside loop. Header's other successor is outside the - // loop. Otherwise loop is not suitable for rotation. - BasicBlock *Exit = BI->getSuccessor(0); - BasicBlock *NewHeader = BI->getSuccessor(1); - if (L->contains(Exit)) - std::swap(Exit, NewHeader); - assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && - "Unable to determine loop header and exit blocks"); - - // This code assumes that the new header has exactly one predecessor. - // Remove any single-entry PHI nodes in it. - assert(NewHeader->getSinglePredecessor() && - "New header doesn't have one pred!"); - FoldSingleEntryPHINodes(NewHeader); - - // Begin by walking OrigHeader and populating ValueMap with an entry for - // each Instruction. - BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - ValueToValueMapTy ValueMap; - - // For PHI nodes, the value available in OldPreHeader is just the - // incoming value from OldPreHeader. - for (; PHINode *PN = dyn_cast(I); ++I) - ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); - - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - - // For the rest of the instructions, either hoist to the OrigPreheader if - // possible or create a clone in the OldPreHeader if not. - TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); - while (I != E) { - Instruction *Inst = &*I++; - - // If the instruction's operands are invariant and it doesn't read or write - // memory, then it is safe to hoist. Doing this doesn't change the order of - // execution in the preheader, but does prevent the instruction from - // executing in each iteration of the loop. This means it is safe to hoist - // something that might trap, but isn't safe to hoist something that reads - // memory (without proving that the loop doesn't write). - if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && - !Inst->mayWriteToMemory() && !isa(Inst) && - !isa(Inst) && !isa(Inst)) { - Inst->moveBefore(LoopEntryBranch); - continue; - } + for (auto UI = Inst.use_begin(), E = Inst.use_end(); UI != E;) { + Use &U = *UI++; + Instruction *UserInst = cast(U.getUser()); + + // Nothing to rename when the use is dominated by the definition. + if (DT->dominates(&Inst, UserInst)) + continue; + + if (!L->contains(UserInst->getParent())) { + // Handle uses in the loop-closed-phi. + PHINode *ClosePhi = cast(UserInst); + BasicBlock *Pred = ClosePhi->getIncomingBlock(U.getOperandNo()); + + // Do not rename a loop close phi node if its predecessor in the loop + // is dominated by Inst. + if (L->contains(Pred) && DT->dominates(BB, Pred)) + continue; + } + + PHINode* PN = getOrCreatePHI(&Inst, NewHeader, NewPreheader, NewLatch, VMap); - // Otherwise, create a duplicate of the instruction. - Instruction *C = Inst->clone(); - - // Eagerly remap the operands of the instruction. - RemapInstruction(C, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - - // With the operands remapped, see if the instruction constant folds or is - // otherwise simplifyable. This commonly occurs because the entry from PHI - // nodes allows icmps and other instructions to fold. - // FIXME: Provide TLI, DT, AC to SimplifyInstruction. - Value *V = SimplifyInstruction(C, DL); - if (V && LI->replacementPreservesLCSSAForm(C, V)) { - // If so, then delete the temporary instruction and stick the folded value - // in the map. - ValueMap[Inst] = V; - if (!C->mayHaveSideEffects()) { - delete C; - C = nullptr; + // When Inst does not dominate U, it is going to use the updated + // definition coming from PN. + U.set(PN); } - } else { - ValueMap[Inst] = C; - } - if (C) { - // Otherwise, stick the new instruction into the new block! - C->setName(Inst->getName()); - C->insertBefore(LoopEntryBranch); - - if (auto *II = dyn_cast(C)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); } } +} + +// Add incoming values to the (already present) PHIs of NewH. +void LoopRotate::adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH, + BasicBlock *NewPH) const { + for (Instruction &Inst : *NewH) { + PHINode *PN = dyn_cast(&Inst); + if (!PN) + break; + assert((PN->getNumOperands() == 1) && "NewH had multiple predecessors."); + Value *Op = PN->getIncomingValue(0); + if (Value *RenamedVal = VMap[Op]) + PN->addIncoming(RenamedVal, NewPH); + else // When no mapping is available (e.g., in case of a constant). + PN->addIncoming(Op, NewPH); + } +} - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - TerminatorInst *TI = OrigHeader->getTerminator(); - for (BasicBlock *SuccBB : TI->successors()) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - - // If there were any uses of instructions in the duplicated block outside the - // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap); - - // NewHeader is now the header of the loop. - L->moveToHeader(NewHeader); - assert(L->getHeader() == NewHeader && "Latch block is our new header"); - - // At this point, we've finished our major CFG changes. As part of cloning - // the loop into the preheader we've simplified instructions and the - // duplicated conditional branch may now be branching on a constant. If it is - // branching on a constant and if that constant means that we enter the loop, - // then we fold away the cond branch to an uncond branch. This simplifies the - // loop in cases important for nested loops, and it also means we don't have - // to split as many edges. - BranchInst *PHBI = cast(OrigPreheader->getTerminator()); - assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - if (!isa(PHBI->getCondition()) || - PHBI->getSuccessor(cast(PHBI->getCondition())->isZero()) != - NewHeader) { - // The conditional branch can't be folded, handle the general case. - // Update DominatorTree to reflect the CFG change we just made. Then split - // edges as necessary to preserve LoopSimplify form. - if (DT) { - // Everything that was dominated by the old loop header is now dominated - // by the original loop preheader. Conceptually the header was merged - // into the preheader, even though we reuse the actual block as a new - // loop latch. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); - SmallVector HeaderChildren(OrigHeaderNode->begin(), - OrigHeaderNode->end()); - DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); - for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) - DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); - - assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); - assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); - - // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(OrigHeader, OrigLatch); +// OrigH and OrigLatch represent an SEME region. Collect all BasicBlocks +// (bounded by \p OrigH and \p OrigLatch), which are exiting the loop in +// \p Blocks and collect all the exits from the region in \p Exits. Returns the +// first basic block which will not be cloned. +BasicBlock *LoopRotate::collectSEMEBlocks(BasicBlock *OrigH, + BasicBlock *OrigLatch, + SmallVecBB &Blocks, + SmallPtrSetBB &Exits) const { + BasicBlock *NewH = nullptr; + for (auto BB = df_begin(OrigH), E = df_end(OrigH); BB != E;) { + if (!L->contains(*BB)) { + BB.skipChildren(); + continue; } - // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. - // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge( - OrigPreheader, NewHeader, - CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); - NewPH->setName(NewHeader->getName() + ".lr.ph"); - - // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. Note that Exit could be an exit block for multiple - // nested loops, causing both of the edges to now be critical and need to - // be split. - SmallVector ExitPreds(pred_begin(Exit), pred_end(Exit)); - bool SplitLatchEdge = false; - for (BasicBlock *ExitPred : ExitPreds) { - // We only need to split loop exit edges. - Loop *PredLoop = LI->getLoopFor(ExitPred); - if (!PredLoop || PredLoop->contains(Exit)) - continue; - if (isa(ExitPred->getTerminator())) - continue; - SplitLatchEdge |= L->getLoopLatch() == ExitPred; - BasicBlock *ExitSplit = SplitCriticalEdge( - ExitPred, Exit, - CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); - ExitSplit->moveBefore(Exit); + // Copy until any BB where the branch does not exit loop, or the loop-latch. + if (OrigLatch == *BB || !L->isLoopExiting(*BB) || + !isa((*BB)->getTerminator())) { + // This will become the new header. + NewH = *BB; + BB.skipChildren(); + } else { + Blocks.push_back(*BB); + + BranchInst *BI = cast((*BB)->getTerminator()); + for (unsigned B = 0, E = BI->getNumSuccessors(); B < E; ++B) { + BasicBlock *Succ = BI->getSuccessor(B); + if (!L->contains(Succ)) + Exits.insert(Succ); + } + ++BB; } - assert(SplitLatchEdge && - "Despite splitting all preds, failed to split latch exit?"); - } else { - // We can fold the conditional branch in the preheader, this makes things - // simpler. The first step is to remove the extra edge to the Exit block. - Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); - BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); - NewBI->setDebugLoc(PHBI->getDebugLoc()); - PHBI->eraseFromParent(); - - // With our CFG finalized, update DomTree if it is available. - if (DT) { - // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(NewHeader, OrigPreheader); - DT->changeImmediateDominator(OrigHeader, OrigLatch); - - // Brute force incremental dominator tree update. Call - // findNearestCommonDominator on all CFG predecessors of each child of the - // original header. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); - SmallVector HeaderChildren(OrigHeaderNode->begin(), - OrigHeaderNode->end()); - bool Changed; - do { - Changed = false; - for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) { - DomTreeNode *Node = HeaderChildren[I]; - BasicBlock *BB = Node->getBlock(); - - pred_iterator PI = pred_begin(BB); - BasicBlock *NearestDom = *PI; - for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) - NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); - - // Remember if this changes the DomTree. - if (Node->getIDom()->getBlock() != NearestDom) { - DT->changeImmediateDominator(BB, NearestDom); - Changed = true; - } - } + } + return NewH; +} - // If the dominator changed, this may have an effect on other - // predecessors, continue until we reach a fixpoint. - } while (Changed); +// Helper function for copySEME. Adjusts the PHIs of all the \p Exits bounding +// an SEME. \p VMap contains mapping of original BB vs copied BB. +static void adjustExitingPhis(ValueToValueMapTy &VMap, + const SmallPtrSetImpl &Exits) { + for (BasicBlock *BB : Exits) { + for (Instruction &Inst : *BB) { + PHINode *PN = dyn_cast(&Inst); + if (!PN) + break; + bool EdgeFromOrigBB = false; + for (unsigned i = 0, e = PN->getNumOperands(); i != e; ++i) { + Value *CopyB = VMap[PN->getIncomingBlock(i)]; + if (!CopyB) // Skip args coming from outside the SEME. + continue; + BasicBlock *CopyBB = cast(CopyB); + EdgeFromOrigBB = true; + Value *Op = PN->getIncomingValue(i); + if (Value *RenamedVal = VMap[Op]) + PN->addIncoming(RenamedVal, CopyBB); + else + // When no mapping is available it may be a constant, + // a function argument, or a global, add as it is. + PN->addIncoming(Op, CopyBB); + } + assert(EdgeFromOrigBB && "Illegal exit from SEME."); } } +} - assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); - assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); +/// Clones the basic blocks (\p Blocks) of an SEME bounded by \p Exits. +/// Blocks[0] is the entry basic block of the SEME. +/// The mapping between original BBs and correponding copied BBs are +/// populated in \p VMap. During copy the DOM and LI of CFG are updated. +/// \returns The entry point of the copied SEME. \p NameSuffix is used to suffix +/// name of the copied BBs. The copied SEME is also an SEME. +static BasicBlock* copyBlocks(const SmallVectorImpl &Blocks, + const SmallPtrSetImpl &Exits, + ValueToValueMapTy &VMap, + const Twine &NameSuffix, + DominatorTree *DT, LoopInfo *LI) { + // Step1: Clone the basic blocks and populate VMap. + BasicBlock *OrigH = Blocks[0]; + + Function *F = OrigH->getParent(); + SmallVector NewBlocks; + for (BasicBlock *BB : Blocks) { + assert(!isa(BB->getTerminator()) && + "Cannot clone SEME with indirect branches."); + + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); + // Move them physically from the end of the block list. + F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(), + NewBB); + Loop *BBLoop = LI->getLoopFor(BB); + Loop *BBParentLoop = BBLoop->getParentLoop(); + if (BBParentLoop) + BBParentLoop->addBasicBlockToLoop(NewBB, *LI); + VMap[BB] = NewBB; + NewBlocks.push_back(NewBB); + } - // Now that the CFG and DomTree are in a consistent state again, try to merge - // the OrigHeader block into OrigLatch. This will succeed if they are - // connected by an unconditional branch. This is just a cleanup so the - // emitted code isn't too gross in this common case. - MergeBlockIntoPredecessor(OrigHeader, DT, LI); + // Step2: Remap the names in copied BBs. + remapInstructionsInBlocks(NewBlocks, VMap); - DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + // Step3: Redirect the edges. + for (BasicBlock *BB : Blocks) { + BasicBlock *NewBB = cast(VMap[BB]); + BranchInst *BI = dyn_cast(NewBB->getTerminator()); + if (!BI) + continue; - ++NumRotated; - return true; -} + for (unsigned I = 0, E = BI->getNumSuccessors(); I < E; ++I) + if (auto *NewSucc = cast_or_null(VMap[BI->getSuccessor(I)])) + BI->setSuccessor(I, NewSucc); + } -/// Determine whether the instructions in this range may be safely and cheaply -/// speculated. This is not an important enough situation to develop complex -/// heuristics. We handle a single arithmetic instruction along with any type -/// conversions. -static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, - BasicBlock::iterator End, Loop *L) { - bool seenIncrement = false; - bool MultiExitLoop = false; + // Step4: Update the DOM of copied SEME. Except for the entry block its tree + // structure is the same as of original SEME so the dominators also follow the + // same structural property. If the IDom of original BB is not in SEME that + // means it is the entry block, in that case the new IDom of the new BB must + // be its single predecessor because we are dealing with an SEME region. + BasicBlock *EntryNewSEME = nullptr; + if (auto *DomT = DT->getNode(Blocks[0])->getIDom()) { + // Entry to SEME has a dominator, update the copied entry. + BasicBlock *Dom = DomT->getBlock(); + assert(!VMap[Dom]); // Dom does not belong to SEME => entry block. + BasicBlock *NewBB = cast(VMap[Blocks[0]]); + EntryNewSEME = NewBB; + DT->addNewBlock(NewBB, Dom); + DT->changeImmediateDominator(NewBB, Dom); + } - if (!L->getExitingBlock()) - MultiExitLoop = true; + for (auto BBI = std::next(Blocks.begin()); BBI != Blocks.end(); ++BBI) { + BasicBlock *NewBB = cast(VMap[*BBI]); + BasicBlock *Dom = DT->getNode(*BBI)->getIDom()->getBlock(); + BasicBlock *NewDom = cast(VMap[Dom]); + DT->addNewBlock(NewBB, NewDom); + DT->changeImmediateDominator(NewBB, NewDom); + } - for (BasicBlock::iterator I = Begin; I != End; ++I) { + // Step5: Adjust PHI nodes for edges exiting the SEME. + adjustExitingPhis(VMap, Exits); + return EntryNewSEME; +} - if (!isSafeToSpeculativelyExecute(&*I)) - return false; +bool LoopRotate::preserveLoopSimplifyForm(const SmallPtrSetBB &Exits) const { + bool HasIndirectBr = false; + // Split the edge from exiting BB to exit BB if not in loop-simplify form. + // We do not guarantee loop-simplify form to be preserved if the original + // loop has indirect exiting branches. + for (BasicBlock *BB: Exits) { + if (!BB->getSinglePredecessor()) { + // Splitting edges on the fly causes predecessor list to be corrupted + // so collect all the predecessors of BB to be split. + SmallVector EdgesToBeSplit; + for (BasicBlock *Pred : predecessors(BB)) + if (L->contains(Pred)) { + if (isa(Pred->getTerminator())) + HasIndirectBr = true; + else + EdgesToBeSplit.push_back(Pred); + } + for (BasicBlock *Pred: EdgesToBeSplit) + SplitEdge(Pred, BB, DT, LI); // Split Pred->BB + } + } + return HasIndirectBr; +} - if (isa(I)) - continue; +/// Rotate loop L. +/// Rotate the SEME (\p Blocks) bounded by \p Exits. +/// Blocks[0] is the entry basic block of the SEME. +void LoopRotate::rotateLoop(BasicBlock *NewH, const SmallVecBB &Blocks, + const SmallPtrSetBB &Exits) const { + bool LoopSimplifyForm = false; + DEBUG(LoopSimplifyForm = L->isLoopSimplifyForm()); + BasicBlock *OrigH = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + BasicBlock *OrigPH = L->getLoopPreheader(); - switch (I->getOpcode()) { - default: - return false; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast(I)->hasAllConstantIndices()) - return false; - // fall-thru to increment case - LLVM_FALLTHROUGH; - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: { - Value *IVOpnd = - !isa(I->getOperand(0)) - ? I->getOperand(0) - : !isa(I->getOperand(1)) ? I->getOperand(1) : nullptr; - if (!IVOpnd) - return false; - - // If increment operand is used outside of the loop, this speculation - // could cause extra live range interference. - if (MultiExitLoop) { - for (User *UseI : IVOpnd->users()) { - auto *UserInst = cast(UseI); - if (!L->contains(UserInst)) - return false; - } - } + DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); - if (seenIncrement) - return false; - seenIncrement = true; + /// The mapping between original BBs in Blocks and correponding copied BBs are + /// populated in VMap. + ValueToValueMapTy VMap; + copyBlocks(Blocks, Exits, VMap, ".lr", DT, LI); + + // Redirect original preheader to the entry of copied SEME. + BranchInst *OrigPHBI = dyn_cast(OrigPH->getTerminator()); + assert(OrigPHBI && (1 == OrigPHBI->getNumSuccessors()) && + "Preheader does not have single successor."); + + BasicBlock *CopyOrigH = cast(VMap[OrigH]); + OrigPHBI->setSuccessor(0, CopyOrigH); + DT->changeImmediateDominator(CopyOrigH, OrigPH); + L->moveToHeader(NewH); + + BasicBlock *BeforeLoop = nullptr; + for (BasicBlock *BB : predecessors(NewH)) + if (!L->contains(BB)) { + BeforeLoop = BB; break; } - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - // ignore type conversions - break; + assert(BeforeLoop && "No entry point to the loop from New Header."); + + Function *F = BeforeLoop->getParent(); + // Move NewH physically to the beginning of the loop. + F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(), + NewH); + + // SplitEdge does not work properly with single-entry PHIs. + BasicBlock *NewPH = BasicBlock::Create( + NewH->getContext(), NewH->getName() + ".lr.ph", F, BeforeLoop); + + Loop *OuterLoop = LI->getLoopFor(OrigPH); + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(NewPH, *LI); + + // BeforeLoop --> NewPH --> NewH. + insertBetween(NewPH, BeforeLoop, NewH); + + DT->addNewBlock(NewPH, BeforeLoop); + DT->changeImmediateDominator(NewPH, BeforeLoop); + DT->changeImmediateDominator(NewH, NewPH); + + // Also, the original entry lost its immediate dominator so its dominator + // should be adjusted. We use SEME property => idom (OrigH) = its single pred. + DT->changeImmediateDominator(OrigH, OrigH->getSinglePredecessor()); + + // Adjust the dominators of original SEME. + for (BasicBlock *BB : Blocks) { + typedef DomTreeNodeBase DTNode; + // Get the subtree of BB in the dominator tree. + DTNode *DTBB = (*DT)[BB]; + std::vector::const_iterator I = DTBB->begin(); + while (I != DTBB->end()) { + BasicBlock *ExitBB = (*I)->getBlock(); + if (L->contains(ExitBB)) { + ++I; + continue; + } + BasicBlock *StaleIDom = DT->getNode(ExitBB)->getIDom()->getBlock(); + BasicBlock *NewBB = cast(VMap[BB]); + // NewIDom is correct because this part of CFG is up-to-date. + BasicBlock *NewIDom = DT->findNearestCommonDominator(StaleIDom, NewBB); + NewIDom = DT->findNearestCommonDominator(NewIDom, BB); + if (NewIDom != StaleIDom) { + DT->changeImmediateDominator(ExitBB, NewIDom); + DEBUG(dbgs() << "\nChanging IDom of " << *ExitBB << "to" << *NewIDom); + I = DTBB->begin(); + } else + ++I; } } - return true; -} - -/// Fold the loop tail into the loop exit by speculating the loop tail -/// instructions. Typically, this is a single post-increment. In the case of a -/// simple 2-block loop, hoisting the increment can be much better than -/// duplicating the entire loop header. In the case of loops with early exits, -/// rotation will not work anyway, but simplifyLoopLatch will put the loop in -/// canonical form so downstream passes can handle it. -/// -/// I don't believe this invalidates SCEV. -bool LoopRotate::simplifyLoopLatch(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - if (!Latch || Latch->hasAddressTaken()) - return false; - BranchInst *Jmp = dyn_cast(Latch->getTerminator()); - if (!Jmp || !Jmp->isUnconditional()) - return false; + adjustNewHeaderPhis(VMap, NewH, NewPH); - BasicBlock *LastExit = Latch->getSinglePredecessor(); - if (!LastExit || !L->isLoopExiting(LastExit)) - return false; + BasicBlock *NewLatch = L->getLoopLatch(); + assert(L->getLoopPreheader() && "Invalid loop preheader after rotation"); + assert(NewLatch && "Invalid loop latch after rotation"); - BranchInst *BI = dyn_cast(LastExit->getTerminator()); - if (!BI) - return false; + addNewPhisToNewHeader(Blocks, NewH, NewPH, NewLatch, VMap); - if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) - return false; + // Discard incoming values in the CopyOrigHeader, which are coming from + // OrigLatch since it has only one predecessor. + discardIncomingValues(CopyOrigH, OrigLatch); + discardIncomingValues(OrigH, OrigPH); + const bool HasIndirectBr = preserveLoopSimplifyForm(Exits); - DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " - << LastExit->getName() << "\n"); + DEBUG(DT->verifyDomTree()); + DEBUG(LI->verify(*DT)); - // Hoist the instructions from Latch into LastExit. - LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), - Latch->begin(), Jmp->getIterator()); + // If the loop was not in loop-simplify form then after the loop + // rotation, it is still not going to be in that form. + if (!HasIndirectBr && LoopSimplifyForm) + assert(L->isLoopSimplifyForm() && "Loop simplify form not preserved."); + assert(L->isRecursivelyLCSSAForm(*DT, *LI) && "Loop is not in LCSSA form."); - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); + DEBUG(dbgs() << "\nLoopRotation: rotated "; L->dumpVerbose()); - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); + assert(isSingleEntryMultipleExit(L->getHeader(), NewLatch, DT) && + "Rotated loop not an SEME"); + assert(isSingleEntryMultipleExit(CopyOrigH, NewPH, DT) && + "Copied SEME not an SEME"); - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); - return true; + ++NumRotated; } /// Rotate \c L, and return true if any modification was made. -bool LoopRotate::processLoop(Loop *L) { +bool LoopRotate::processLoop() { + if (!isLegalToRotate()) + return false; + + BasicBlock *OrigH = L->getHeader(); + BasicBlock *LoopLatch = L->getLoopLatch(); + + // Basic blocks to be copied. + SmallVector Blocks; + SmallPtrSet Exits; + // Collect all nodes of the loop from header to latch. + BasicBlock *NewH = collectSEMEBlocks(OrigH, LoopLatch, Blocks, Exits); + assert(NewH && "Invalid SEME region."); + + if (!isLoopSizeWithinLimits(Blocks, Exits)) + return false; + // Save the loop metadata. - MDNode *LoopMD = L->getLoopID(); + MDNode *LoopMD = nullptr; + Instruction *TI = nullptr; + std::tie(LoopMD, TI) = L->getLoopIDWithInstr(); + + // Make sure the latch has only one successor. + if (!LoopLatch->getSingleSuccessor()) { + // Since LoopLatch was skipped while collecting SEME blocks, exits out of + // the loop-latch is collected here. + BranchInst *BLI = dyn_cast(LoopLatch->getTerminator()); + if (!BLI) + return false; + // If the NewH is the loop-latch => Exits does not include the exiting block + // from loop-latch (see collectSEMEBlocks). + if (NewH == LoopLatch) + for (unsigned I = 0, E = BLI->getNumSuccessors(); I < E; ++I) { + BasicBlock *Succ = BLI->getSuccessor(I); + if (Succ != OrigH && !L->contains(Succ)) + Exits.insert(Succ); + } + + // The old loop-latch will become the parent of new loop-latch. + DEBUG(dbgs() << "\nSplitting the edge of Loop:"; L->dumpVerbose();); + BasicBlock *NewLoopLatch = SplitEdge(LoopLatch, OrigH, DT, LI); + + // If the NewH is the loop-latch => NewH would change as the loop-latch has + // changed after splitting. Also, we need to add the (old) loop-latch to the + // blocks because it was skipped (all the blocks before NewH are copied to + // new SEME). + if (NewH == LoopLatch) { + NewH = NewLoopLatch; + Blocks.push_back(LoopLatch); + } + LoopLatch = NewLoopLatch; + } - // Simplify the loop latch before attempting to rotate the header - // upward. Rotation may not be needed if the loop tail can be folded into the - // loop exit. - bool SimplifiedLatch = simplifyLoopLatch(L); + assert(LoopLatch->getSingleSuccessor() && "Invalid SEME region."); - bool MadeChange = rotateLoop(L, SimplifiedLatch); - assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) && + // Anything ScalarEvolution may know about this loop or the PHI nodes in its + // header will soon be invalidated. + if (SE) + SE->forgetLoop(L); + + rotateLoop(NewH, Blocks, Exits); + assert(L->isLoopExiting(L->getLoopLatch()) && "Loop latch should be exiting after loop-rotate."); - // Restore the loop metadata. - // NB! We presume LoopRotation DOESN'T ADD its own metadata. - if ((MadeChange || SimplifiedLatch) && LoopMD) + if (LoopMD) { + // Drop loop metadata from the old loop header. + SmallVector MDs; + TI->getAllNonDebugMetadataIDs(MDs); + MDs.erase(llvm::remove_if(MDs, [](unsigned ID) { + return ID == LLVMContext::MD_loop; }), MDs.end()); + + TI->dropUnknownNonDebugMetadata(MDs); + + // Restore the loop metadata to the new loop header. L->setLoopID(LoopMD); + } - return MadeChange; + return true; } LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication) @@ -628,10 +707,9 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { - int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0; - LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE); + LoopRotate LR(&AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, &L); - bool Changed = LR.processLoop(&L); + bool Changed = LR.processLoop(); if (!Changed) return PreservedAnalyses::all(); @@ -641,22 +719,17 @@ namespace { class LoopRotateLegacyPass : public LoopPass { - unsigned MaxHeaderSize; - public: static char ID; // Pass ID, replacement for typeid - LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { + LoopRotateLegacyPass() : LoopPass(ID) { initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry()); - if (SpecifiedMaxHeaderSize == -1) - MaxHeaderSize = DefaultRotationThreshold; - else - MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); } // LCSSA form makes instruction renaming easier. void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); getLoopAnalysisUsage(AU); } @@ -672,8 +745,8 @@ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *SEWP = getAnalysisIfAvailable(); auto *SE = SEWP ? &SEWP->getSE() : nullptr; - LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE); - return LR.processLoop(L); + LoopRotate LR(LI, TTI, AC, DT, SE, L); + return LR.processLoop(); } }; } @@ -683,10 +756,9 @@ false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false, false) -Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { - return new LoopRotateLegacyPass(MaxHeaderSize); -} +Pass *llvm::createLoopRotatePass() { return new LoopRotateLegacyPass(); } Index: llvm/test/Analysis/GlobalsModRef/memset-escape.ll =================================================================== --- llvm/test/Analysis/GlobalsModRef/memset-escape.ll +++ llvm/test/Analysis/GlobalsModRef/memset-escape.ll @@ -6,15 +6,19 @@ @a = internal global [3 x i32] zeroinitializer, align 4 @b = common global i32 0, align 4 -; The important thing we're checking for here is the reload of (some element of) -; @a after the memset. +; Check that load and the call to abort is redundant. +; CHECK: store i32 1, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4 +; CHECK: store i32 0, i32* @b, align 4 +; CHECK: br label %for.body -; CHECK-LABEL: @main -; CHECK: call void @llvm.memset.p0i8.i64{{.*}} @a -; CHECK: store i32 3 -; CHECK: load i32, i32* getelementptr {{.*}} @a -; CHECK: icmp eq i32 -; CHECK: br i1 +; CHECK: for.body: ; preds = %for.body.lr.ph +; CHECK: store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 0), align 4 +; CHECK: store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4 +; CHECK: store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4 +; CHECK: store i32 3, i32* @b, align 4 +; CHECK: br i1 true, label %if.end, label %if.then +; CHECK-NOT: load +; CHECK-NOT: call void @abort() define i32 @main() { entry: Index: llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll =================================================================== --- llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: opt < %s -basicaa -globalopt -instcombine -loop-rotate -licm -instcombine -indvars -loop-deletion -constmerge -S | FileCheck %s -; PR11882: ComputeLoadConstantCompareExitLimit crash. -; -; for.body is deleted leaving a loop-invariant load. -; CHECK-NOT: for.body -target datalayout = "e-p:64:64:64-n32:64" - -@func_21_l_773 = external global i32, align 4 -@g_814 = external global i32, align 4 -@g_244 = internal global [1 x [0 x i32]] zeroinitializer, align 4 - -define void @func_21() nounwind uwtable ssp { -entry: - br label %lbl_818 - -lbl_818: ; preds = %for.end, %entry - call void (...) @func_27() - store i32 0, i32* @g_814, align 4 - br label %for.cond - -for.cond: ; preds = %for.body, %lbl_818 - %0 = load i32, i32* @g_814, align 4 - %cmp = icmp sle i32 %0, 0 - br i1 %cmp, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %idxprom = sext i32 %0 to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* getelementptr inbounds ([1 x [0 x i32]], [1 x [0 x i32]]* @g_244, i32 0, i64 0), i32 0, i64 %idxprom - %1 = load i32, i32* %arrayidx, align 1 - store i32 %1, i32* @func_21_l_773, align 4 - store i32 1, i32* @g_814, align 4 - br label %for.cond - -for.end: ; preds = %for.cond - %2 = load i32, i32* @func_21_l_773, align 4 - %tobool = icmp ne i32 %2, 0 - br i1 %tobool, label %lbl_818, label %if.end - -if.end: ; preds = %for.end - ret void -} - -declare void @func_27(...) Index: llvm/test/Transforms/LoopRotate/basic.ll =================================================================== --- llvm/test/Transforms/LoopRotate/basic.ll +++ llvm/test/Transforms/LoopRotate/basic.ll @@ -18,9 +18,10 @@ %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 br i1 %cmp, label %for.body, label %for.end -; CHECK: for.body: +; CHECK: for.cond.lr: ; CHECK-NEXT: phi i32 [ 0 -; CHECK-NEXT: store i32 0 +; CHECK: for.body: +; CHECK: store i32 0 for.body: ; preds = %for.cond store i32 0, i32* %arrayidx, align 16 Index: llvm/test/Transforms/LoopRotate/dbgvalue.ll =================================================================== --- llvm/test/Transforms/LoopRotate/dbgvalue.ll +++ llvm/test/Transforms/LoopRotate/dbgvalue.ll @@ -5,9 +5,9 @@ define i32 @tak(i32 %x, i32 %y, i32 %z) nounwind ssp !dbg !0 { ; CHECK-LABEL: define i32 @tak( -; CHECK: entry -; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %x -; CHECK: tail call void @llvm.dbg.value(metadata i32 %call +; CHECK: tailrecurse.lr +; CHECK: call void @llvm.dbg.value(metadata i32 %x +; CHECK: tail call void @llvm.dbg.value(metadata i32 %y entry: br label %tailrecurse @@ -42,7 +42,7 @@ ; CHECK-LABEL: define i32 @tak2( ; CHECK: entry ; CHECK: tail call void @llvm.dbg.value(metadata i32 %x.tr -; CHECK: tail call void @llvm.dbg.value(metadata i32 undef +; CHECK: tail call void @llvm.dbg.value(metadata i32 %y.tr entry: br label %tailrecurse @@ -83,12 +83,13 @@ ; Ensure that the loop increment basic block is rotated into the tail of the ; body, even though it contains a debug intrinsic call. ; CHECK-LABEL: define void @FindFreeHorzSeg( +; CHECK: for.inc: +; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ] ; CHECK: %dec = add ; CHECK-NEXT: tail call void @llvm.dbg.value +; CHECK-NEXT: br label %for.cond ; CHECK: %cmp = icmp ; CHECK: br i1 %cmp -; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ] -; CHECK-NEXT: br label %for.end entry: Index: llvm/test/Transforms/LoopRotate/loop-rotate.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopRotate/loop-rotate.ll @@ -0,0 +1,284 @@ +; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Check that a basic while loop is rotated. +; CHECK-LABEL: @bar0 +; CHECK: while.cond.lr: ; preds = %entry +; CHECK: while.body.lr.ph: ; preds = %while.cond.lr + +declare void @foo0(i64* %this, i64* %that) + +define void @bar0(i64* %__begin1, i64* %__end1, i64** %__end2) { +entry: + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %__end1.addr.0 = phi i64* [ %__end1, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i64* %__end1.addr.0, %__begin1 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = load i64*, i64** %__end2, align 8 + tail call void @foo0(i64* %__begin1, i64* %incdec.ptr) + store i64* %incdec.ptr, i64** %__end2, align 8 + br label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +; Check that the loop is rotated and multiple phis are updated. +; CHECK-LABEL: @foo2 +; CHECK: do.body.i10.lr: ; preds = %if.else.i3 +; CHECK: do.cond.i19.lr.ph: ; preds = %do.body.i10.lr + +define void @foo2() { +entry: + br i1 undef, label %for.body, label %for.end + +for.body: ; preds = %entry + br i1 undef, label %inverse.exit21, label %if.else.i3 + +if.else.i3: ; preds = %for.body + br label %do.body.i10 + +do.body.i10: ; preds = %do.cond.i19, %if.else.i3 + %b1.0.i4 = phi i64 [ 0, %if.else.i3 ], [ %b2.0.i7, %do.cond.i19 ] + %b2.0.i7 = phi i64 [ 1, %if.else.i3 ], [ %sub8.i18, %do.cond.i19 ] + br i1 undef, label %do.cond.thread.i15, label %do.cond.i19 + +do.cond.thread.i15: ; preds = %do.body.i10 + br label %inverse.exit21 + +do.cond.i19: ; preds = %do.body.i10 + %mul.i17 = mul nsw i64 undef, %b2.0.i7 + %sub8.i18 = sub nsw i64 %b1.0.i4, %mul.i17 + br label %do.body.i10 + +inverse.exit21: ; preds = %do.cond.thread.i15, %for.body + br label %for.end + +for.end: ; preds = %inverse.exit21, %entry + ret void +} + +; Check that the loop with switch block within loop is rotated and the critical edge is first split. +; CHECK-LABEL: @foo3 +; CHECK: entry: +; CHECK: br i1 %cmp1, label %while.cond.preheader, label %while.exit +; CHECK: while.cond.lr: ; preds = %while.cond.preheader +; CHECK: %conv7.lr = ashr exact i64 undef, 32 +; CHECK: br i1 %cmp, label %if.else106.lr.ph, label %while.exit.loopexit +; CHECK: if.else106.lr.ph: ; preds = %while.cond.lr +; CHECK: if.else106: ; preds = %if.else106.lr.ph, %while.cond +; CHECK: switch i32 %val, label %if.else106.while.exit.loopexit_crit_edge [ +; CHECK: if.else106.while.exit.loopexit_crit_edge: ; preds = %if.else106 +; CHECK: while.cond: ; preds = %if.then467 +; CHECK: %conv7 = ashr exact i64 undef, 32 +; CHECK: br i1 %cmp, label %if.else106, label %while.cond.while.exit.loopexit_crit_edge +; CHECK: while.cond.while.exit.loopexit_crit_edge: ; preds = %while.cond +; CHECK: if.then130: ; preds = %if.else106 +; CHECK: if.then467: ; preds = %if.then130, %if.else106 +; CHECK: while.exit.loopexit: ; preds = %if.else106.while.exit.loopexit_crit_edge, %while.cond.while.exit.loopexit_crit_edge, %while.cond.lr +; CHECK: while.exit: ; preds = %while.exit.loopexit, %entry + +define i32 @foo3(i1 %cmp, i1 %cmp1, i32 %val) { +entry: + br i1 %cmp1, label %while.cond, label %while.exit + +while.cond: + %conv7 = ashr exact i64 undef, 32 + br i1 %cmp, label %if.else106, label %while.exit + +if.else106: + switch i32 %val, label %while.exit [ + i32 12, label %if.then130 + i32 30, label %if.then467 + ] + +if.then130: + br label %if.then467 + +if.then467: + br label %while.cond + +while.exit: + ret i32 0 +} + +; Check that the loop with pre-header and loop-body having a switch block is rotated. +; CHECK-LABEL: @bar5 +; CHECK: bb19.preheader: ; preds = %bb10, %bb10 +; CHECK: bb19.lr: ; preds = %bb19.preheader +; CHECK: bb12.lr.ph: ; preds = %bb19.lr + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare i32 @foo5(i8*, i32) + +define i32 @bar5(i8* %arg, i32 %arg1, i8* %arg2, i32 %arg3) { +bb10: + switch i32 %arg3, label %bb11 [ + i32 46, label %bb19 + i32 32, label %bb19 + ] + +bb11: ; preds = %bb10 + ret i32 -1 + +bb12: ; preds = %bb19 + switch i8 undef, label %bb13 [ + i8 32, label %bb21 + i8 46, label %bb21 + ] + +bb13: ; preds = %bb12 + br label %bb15 + +bb15: ; preds = %bb13 + br i1 undef, label %bb17, label %bb16 + +bb16: ; preds = %bb15 + %tmp = call i32 @foo5(i8* %tmp20, i32 10) + ret i32 0 + +bb17: ; preds = %bb15 + %tmp18 = phi i8* [ %tmp20, %bb15 ] + br label %bb19 + +bb19: ; preds = %bb17, %bb10, %bb10 + %tmp20 = phi i8* [ %tmp18, %bb17 ], [ null, %bb10 ], [ null, %bb10 ] + br i1 undef, label %bb21, label %bb12 + +bb21: ; preds = %bb19, %bb12, %bb12 + ret i32 0 +} + +declare i8* @foo7() + +; Check that the loop with multiple exits and a if-block branching within the loop is rotated. +; CHECK-LABEL: @bar7 +; CHECK: entry: +; CHECK: for.cond.preheader: ; preds = %entry +; CHECK: br label %for.cond.lr +; CHECK: for.cond.lr: ; preds = %for.cond.preheader +; CHECK: for.body.lr: ; preds = %for.cond.lr +; CHECK: if.end20.lr.ph: ; preds = %for.body.lr +; CHECK: if.end20: ; preds = %if.end20.lr.ph, %for.body +; CHECK: %phi.nh = phi i8* [ %call15, %for.body ], [ %call15.lr, %if.end20.lr.ph ] +; CHECK: for.cond: ; preds = %for.inc +; CHECK: br i1 false, label %for.body, label %for.cond.return.loopexit_crit_edge +; CHECK: for.cond.return.loopexit_crit_edge: ; preds = %for.cond +; CHECK: for.body: ; preds = %for.cond +; CHECK: br i1 false, label %if.end20, label %for.body.return.loopexit_crit_edge +; CHECK: for.body.return.loopexit_crit_edge: ; preds = %for.body +; CHECK: if.then23: ; preds = %crl, %if.then.i +; CHECK: %call15.lcssa = phi i8* [ %phi.nh, %crl ], [ %phi.nh, %if.then.i ] + +define void @bar7(i1 %cmp1) { +entry: + br i1 %cmp1, label %return, label %for.cond + +for.cond: ; preds = %for.inc, %entry + br i1 undef, label %for.body, label %return + +for.body: ; preds = %for.cond + %call15 = call i8* @foo7() + br i1 undef, label %if.end20, label %return + +if.end20: ; preds = %for.body + %issuer.i = getelementptr inbounds i8, i8* %call15, i64 24 + br i1 undef, label %if.then.i, label %for.cond.i + +if.then.i: ; preds = %if.end20 + br i1 undef, label %if.then23, label %crl + +for.cond.i: ; preds = %if.end20 + br i1 undef, label %crl, label %for.inc + +crl: ; preds = %if.then.i, %for.cond.i + br i1 undef, label %if.then23, label %for.inc + +if.then23: ; preds = %crl, %if.then.i + %reason = getelementptr inbounds i8, i8* %call15, i64 32 + br label %return + +for.inc: ; preds = %crl, %for.cond.i + br label %for.cond + +return: ; preds = %if.then23, %for.body, %for.cond, %if.end + ret void +} + +; Check that the loop-latch with indirect branch is not rotated. +; CHECK-LABEL: @foo8 +; CHECK-NOT: {{.*}}.lr +; CHECK-NOT: {{.*}}.lr.ph + +@f.x = internal global [3 x i8*] [i8* blockaddress(@foo8, %F), i8* blockaddress(@foo8, %G), i8* blockaddress(@foo8, %H)], align 16 + +; Function Attrs: nounwind uwtable +define i32 @foo8(i32 %i, i32 %j, i32 %k) { +entry: + %retval = alloca i32, align 4 + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + %k.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + store i32 %k, i32* %k.addr, align 4 + br label %F + +F: ; preds = %entry, %indirectgoto + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %F + br label %G + +if.else: ; preds = %F + %2 = load i32, i32* %i.addr, align 4 + %3 = load i32, i32* %k.addr, align 4 + %cmp1 = icmp sgt i32 %2, %3 + br i1 %cmp1, label %if.then2, label %if.end + +if.then2: ; preds = %if.else + br label %H + +if.end: ; preds = %if.else + br label %if.end3 + +if.end3: ; preds = %if.end + %call = call i32 (...) @z() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds [3 x i8*], [3 x i8*]* @f.x, i64 0, i64 %idxprom + %4 = load i8*, i8** %arrayidx, align 8 + br label %indirectgoto + +G: ; preds = %if.then, %indirectgoto + %call4 = call i32 (...) @g() + store i32 %call4, i32* %retval, align 4 + br label %return + +H: ; preds = %if.then2, %indirectgoto + %call5 = call i32 (...) @h() + store i32 %call5, i32* %retval, align 4 + br label %return + +return: ; preds = %H, %G + %5 = load i32, i32* %retval, align 4 + ret i32 %5 + +indirectgoto: ; preds = %if.end3 + %indirect.goto.dest = phi i8* [ %4, %if.end3 ] + indirectbr i8* %indirect.goto.dest, [label %F, label %G, label %H] +} + +declare i32 @g(...) +declare i32 @h(...) +declare i32 @z(...) Index: llvm/test/Transforms/LoopRotate/multiple-exits.ll =================================================================== --- llvm/test/Transforms/LoopRotate/multiple-exits.ll +++ llvm/test/Transforms/LoopRotate/multiple-exits.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -; PR7447 +; PR7447: there should be one loop rotated, for.cond is rotated. define i32 @test1([100 x i32]* nocapture %a) nounwind readonly { entry: br label %for.cond @@ -32,16 +32,13 @@ %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ] ret i32 %retval.0 -; CHECK-LABEL: @test1( -; CHECK: for.cond1.preheader: -; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ] -; CHECK: br label %for.cond1 - -; CHECK: for.cond1: -; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ] -; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ] -; CHECK: %cmp2 = icmp ult i32 %i.1, 100 -; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit +; CHECK-LABEL: @test1 +; Check that the outer loop is rotated. +; CHECK: for.cond.lr: +; CHECK: for.cond1.preheader.lr.ph: +; CHECK: for.cond1: ; preds = %for.cond1.preheader, %land.rhs +; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %phi.nh, %for.cond1.preheader ] +; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ] } define void @test2(i32 %x) nounwind { @@ -73,11 +70,16 @@ return: ; preds = %return.loopexit, %a ret void +; Both for.cond and for.body are cloned outside of the loop and if.end is the new loop header. ; CHECK-LABEL: @test2( ; CHECK: if.end: -; CHECK: %inc = add i32 %i.02, 1 -; CHECK: %cmp = icmp eq i32 %inc, %x -; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body +; CHECK-NEXT: phi +; CHECK: %inc = add i32 %phi.nh, 1 +; CHECK: for.cond: +; CHECK-NEXT: phi +; CHECK-NEXT: %cmp = icmp eq i32 %i.0, %x +; CHECK-NEXT: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body +; CHECK: for.cond.return.loopexit_crit_edge: } declare i32 @foo(i32) Index: llvm/test/Transforms/LoopRotate/nosimplifylatch.ll =================================================================== --- llvm/test/Transforms/LoopRotate/nosimplifylatch.ll +++ llvm/test/Transforms/LoopRotate/nosimplifylatch.ll @@ -3,7 +3,9 @@ target triple = "arm64-apple-ios8.0.0" ;CHECK: for.inc: -;CHECK-NEXT: %incdec.ptr.i = getelementptr +;CHECK-NEXT: phi +;CHECK-NEXT: %incdec.ptr.i = getelementptr +;CHECK-NEXT: br ; Function Attrs: alwaysinline inlinehint nounwind readonly ssp define linkonce_odr hidden i64 @_ZNSt3__14findINS_11__wrap_iterIPiEEiEET_S4_S4_RKT0_(i64 %__first.coerce, i64 %__last.coerce, i32* nocapture readonly dereferenceable(4) %__value_) { Index: llvm/test/Transforms/LoopRotate/phi-duplicate.ll =================================================================== --- llvm/test/Transforms/LoopRotate/phi-duplicate.ll +++ llvm/test/Transforms/LoopRotate/phi-duplicate.ll @@ -31,10 +31,13 @@ ; Should only end up with one phi. ; CHECK-LABEL: define void @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label %for.body +; CHECK-NEXT: br label %for.cond.lr ; CHECK: for.body: -; CHECK-NEXT: %j.01 = phi i64 -; CHECK-NOT: br -; CHECK: br i1 %cmp, label %for.body, label %for.end -; CHECK: for.end: -; CHECK-NEXT: ret void +; CHECK-NEXT: %phi.nh = phi i64 +; CHECK-NOT: = phi +; CHECK: %inc = add nsw i64 %phi.nh, 1 +; CHECK: br label %for.cond +; CHECK: for.cond: +; CHECK: br i1 %cmp +; CHECK: for.end: +; CHECK-NEXT: ret void Index: llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll =================================================================== --- llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll +++ llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll @@ -4,6 +4,9 @@ ; structures. We manually validate the CFG with FileCheck because currently we ; can't cause a failure when LoopSimplify fails to be preserved. +; Check that inner.header and inner.body both are cloned outside the loop +; such that inner.latch becomes the new loop header. + define void @PR18643() { ; CHECK-LABEL: @PR18643( entry: @@ -16,10 +19,16 @@ ; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER:[^,]*]], label %outer.body ; CHECK: [[INNER_PREROTATE_PREHEADER]]: -; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER_SPLIT_RETURN:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]] +; CHECK: br label %inner.header.lr + +; CHECK: inner.header.lr: +; CHECK: br i1 true, label %return, label %inner.body.lr + +; CHECK: inner.body.lr: +; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]] ; CHECK: [[INNER_ROTATED_PREHEADER]]: -; CHECK-NEXT: br label %inner.body +; CHECK-NEXT: br label %inner.latch inner.header: ; Now the latch! @@ -27,11 +36,17 @@ br i1 undef, label %return, label %inner.body ; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_RETURN:[^,]*]], label %inner.body +; CHECK: [[INNER_SPLIT_RETURN]]: +; CHECK: br label %return + inner.body: ; Now the header! ; CHECK: inner.body: br i1 undef, label %outer.latch, label %inner.latch -; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_OUTER_LATCH:[^,]*]], label %inner.header +; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT:[^,]*]], label %inner.latch + +; CHECK: [[OUTER_LATCH_LOOPEXIT]]: +; CHECK-NEXT: br label %outer.latch.loopexit inner.latch: ; Dead! @@ -42,23 +57,12 @@ br label %outer.latch ; CHECK-NEXT: br label %outer.latch -; L2 -> L1 exit edge needs a simplified exit block. -; CHECK: [[INNER_SPLIT_OUTER_LATCH]]: -; CHECK-NEXT: br label %outer.latch outer.latch: ; CHECK: outer.latch: br label %outer.header ; CHECK-NEXT: br label %outer.header -; L1 -> L0 exit edge need sa simplified exit block. -; CHECK: [[INNER_PREROTATE_PREHEADER_SPLIT_RETURN]]: -; CHECK-NEXT: br label %return - -; L2 -> L0 exit edge needs a simplified exit block. -; CHECK: [[INNER_SPLIT_RETURN]]: -; CHECK-NEXT: br label %return - return: ; CHECK: return: unreachable Index: llvm/test/Transforms/LoopRotate/simplifylatch.ll =================================================================== --- llvm/test/Transforms/LoopRotate/simplifylatch.ll +++ llvm/test/Transforms/LoopRotate/simplifylatch.ll @@ -3,8 +3,13 @@ @mode_table = global [4 x i32] zeroinitializer ; <[4 x i32]*> [#uses=1] -; CHECK-LABEL: @f( -; CHECK-NOT: bb: +; Check that loop with multiple exits is rotated. +; CHECK-LABEL: @f +; CHECK: bb.lr: +; CHECK: bb2.lr: +; CHECK: bb4.lr.ph: +; CHECK: bb: + define i8 @f() { entry: tail call i32 @fegetround( ) ; :0 [#uses=1] @@ -35,42 +40,5 @@ } declare i32 @fegetround() - declare void @raise_exception() noreturn -;CHECK: for.body.lr.ph: -;CHECK-NEXT: %arrayidx1 = getelementptr inbounds i8, i8* %CurPtr, i64 0 -;CHECK-NEXT: %0 = load i8, i8* %arrayidx1, align 1 -;CHECK-NEXT: %conv2 = sext i8 %0 to i32 -;CHECK-NEXT: br label %for.body - -define i32 @foo(i8* %CurPtr, i32 %a) #0 { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %i.0 = phi i32 [ 1, %entry ], [ %inc, %for.inc ] - %cmp = icmp ne i32 %i.0, %a - br i1 %cmp, label %for.body, label %return - -for.body: ; preds = %for.cond - %idxprom = zext i32 %i.0 to i64 - %arrayidx = getelementptr inbounds i8, i8* %CurPtr, i64 %idxprom - %0 = load i8, i8* %arrayidx, align 1 - %conv = sext i8 %0 to i32 - %arrayidx1 = getelementptr inbounds i8, i8* %CurPtr, i64 0 - %1 = load i8, i8* %arrayidx1, align 1 - %conv2 = sext i8 %1 to i32 - %cmp3 = icmp ne i32 %conv, %conv2 - br i1 %cmp3, label %return, label %for.inc - -for.inc: ; preds = %for.body - %inc = add i32 %i.0, 1 - br label %for.cond - -return: ; preds = %for.cond, %for.body - %retval.0 = phi i32 [ 0, %for.body ], [ 1, %for.cond ] - ret i32 %retval.0 -} - -attributes #0 = { nounwind uwtable } Index: llvm/test/Transforms/LoopRotate/vect.omp.persistence.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopRotate/vect.omp.persistence.ll @@ -0,0 +1,98 @@ +; RUN: opt < %s -loop-rotate -S | FileCheck -check-prefix=CHECK1 %s +; REQUIRES: asserts + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; The loop rotate should not remove the metadata. +; See http://reviews.llvm.org/D3348 for details. + +; +; Test #1 +; +; Ensure that "llvm.loop.vectorize.enable" metadata was not lost after loop-rotate. +; In past LoopRotate was clearing that metadata. +; +; The source C code is: +; void rotated(float *a, int size) +; { +; int t = 0; +; #pragma omp simd +; for (int i = 0; i < size; ++i) { +; a[i] = a[i-5] * a[i+2]; +; ++t; +; } +;} + +; CHECK1: @rotated1 +; CHECK1: for.header.lr: +; CHECK1: for.body.lr.ph: +; CHECK1: for.header: +; CHECK1: br i1 %cmp2, label %for.header.for.end.loopexit_crit_edge, label %for.body, !llvm.loop !0 +; CHECK1: for.header.for.end.loopexit_crit_edge: + +define void @rotated1(float* nocapture %a, i64 %size) { +entry: + %cmp1 = icmp sgt i64 %size, 0 + br i1 %cmp1, label %for.header, label %for.end + +for.header: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %cmp2 = icmp sgt i64 %indvars.iv, %size + br i1 %cmp2, label %for.end, label %for.body + +for.body: + + %0 = add nsw i64 %indvars.iv, -5 + %arrayidx = getelementptr inbounds float, float* %a, i64 %0 + %1 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1 + %2 = add nsw i64 %indvars.iv, 2 + %arrayidx2 = getelementptr inbounds float, float* %a, i64 %2 + %3 = load float, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1 + %mul = fmul float %1, %3 + %arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !1 + + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.header, !llvm.loop !1 + +for.end: + ret void +} + +!1 = !{!1, !2} +!2 = !{!"llvm.loop.vectorize.enable", i1 true} + +; CHECK1: @rotated2 +; CHECK1: loop_cond.lr: +; CHECK1: loop_inc.lr.ph: +; CHECK1: loop_inc: +; CHECK1: loop_cond: +; CHECK1: br i1 %cmp, label %loop_cond.return_crit_edge, label %loop_inc, !llvm.loop !2 +; CHECK1: loop_cond.return_crit_edge: + +; +; Test #2 +; +; Ensure that "llvm.loop.vectorize.enable" metadata was not lost due to loop rotation +; (see http://reviews.llvm.org/D3348#comment-4). +; +define i32 @rotated2(i32 %a) { +entry: + br label %loop_cond +loop_cond: + %indx = phi i32 [ 1, %entry ], [ %inc, %loop_inc ] + %cmp = icmp ne i32 %indx, %a + br i1 %cmp, label %return, label %loop_inc +loop_inc: + %inc = add i32 %indx, 1 + br label %loop_cond, !llvm.loop !3 +return: + ret i32 0 +} + +!3 = !{!3, !4} +!4 = !{!"llvm.loop.vectorize.enable", i1 true} + +; CHECK1: !0 = distinct !{!0, !1} +; CHECK1: !1 = !{!"llvm.loop.vectorize.enable", i1 true} +; CHECK1: !2 = distinct !{!2, !1} Index: llvm/test/Transforms/LoopSimplify/ashr-crash.ll =================================================================== --- llvm/test/Transforms/LoopSimplify/ashr-crash.ll +++ llvm/test/Transforms/LoopSimplify/ashr-crash.ll @@ -29,9 +29,9 @@ ; CHECK-LABEL: entry: ; CHECK-LABEL: for.cond1.preheader: ; CHECK-LABEL: for.body3: -; CHECK: %cmp4.le.le -; CHECK: %conv.le.le = zext i1 %cmp4.le.le to i32 -; CHECK: %xor.le.le = xor i32 %conv6.le.le, 1 +; CHECK: %cmp4 +; CHECK: %conv = zext i1 %cmp4 to i32 +; CHECK: %xor = xor i32 %conv6, 1 define void @foo() { entry: br label %for.cond Index: llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll +++ llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll @@ -3,8 +3,6 @@ ; Loop from "rotated" ; CHECK: LV: Loop hints: force=enabled -; Loop from "nonrotated" -; CHECK: LV: Loop hints: force=enabled ; No more loops in the module ; CHECK-NOT: LV: Loop hints: force= ; In total only 1 loop should be rotated. @@ -63,25 +61,3 @@ !1 = !{!1, !2} !2 = !{!"llvm.loop.vectorize.enable", i1 true} -; -; Test #2 -; -; Ensure that "llvm.loop.vectorize.enable" metadata was not lost even -; if loop was not rotated (see http://reviews.llvm.org/D3348#comment-4). -; -define i32 @nonrotated(i32 %a) { -entry: - br label %loop_cond -loop_cond: - %indx = phi i32 [ 1, %entry ], [ %inc, %loop_inc ] - %cmp = icmp ne i32 %indx, %a - br i1 %cmp, label %return, label %loop_inc -loop_inc: - %inc = add i32 %indx, 1 - br label %loop_cond, !llvm.loop !3 -return: - ret i32 0 -} - -!3 = !{!3, !4} -!4 = !{!"llvm.loop.vectorize.enable", i1 true}