diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -46,6 +46,11 @@
 STATISTIC(NumRotated, "Number of loops rotated");
 
+static cl::opt<bool>
+    MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden,
+                cl::desc("Allow loop rotation multiple times in order to reach "
+                         "a better latch exit"));
+
 namespace {
 /// A simple loop rotation transformation.
 class LoopRotate {
@@ -177,14 +182,16 @@
   }
 }
 
-// Look for a phi which is only used outside the loop (via a LCSSA phi)
-// in the exit from the header. This means that rotating the loop can
-// remove the phi.
-static bool shouldRotateLoopExitingLatch(Loop *L) {
+// Assuming both the header and the latch are exiting, look for a phi which is
+// only used outside the loop (via an LCSSA phi) in the exit from the header.
+// This means that rotating the loop can remove the phi.
+static bool profitableToRotateLoopExitingLatch(Loop *L) {
   BasicBlock *Header = L->getHeader();
-  BasicBlock *HeaderExit = Header->getTerminator()->getSuccessor(0);
+  BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator());
+  assert(BI && BI->isConditional() && "need header with conditional exit");
+  BasicBlock *HeaderExit = BI->getSuccessor(0);
   if (L->contains(HeaderExit))
-    HeaderExit = Header->getTerminator()->getSuccessor(1);
+    HeaderExit = BI->getSuccessor(1);
 
   for (auto &Phi : Header->phis()) {
     // Look for uses of this phi in the loop/via exits other than the header.
@@ -194,7 +201,50 @@
       continue;
     return true;
   }
+  return false;
+}
+
+// Check that the latch exit is deoptimizing (which means it is very unlikely
+// to be taken) and that there is another exit from the loop which is
+// non-deoptimizing. If we rotate the latch to that exit, our loop has a
+// better chance of being fully canonical.
+//
+// It can give false positives in some rare cases.
+static bool canRotateDeoptimizingLatchExit(Loop *L) {
+  BasicBlock *Latch = L->getLoopLatch();
+  assert(Latch && "need latch");
+  BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
+  // Need normal exiting latch.
+  if (!BI || !BI->isConditional())
+    return false;
+
+  BasicBlock *Exit = BI->getSuccessor(1);
+  if (L->contains(Exit))
+    Exit = BI->getSuccessor(0);
+
+  // Latch exit is non-deoptimizing, no need to rotate.
+  if (!Exit->getPostdominatingDeoptimizeCall())
+    return false;
+
+  SmallVector<BasicBlock *, 4> Exits;
+  L->getUniqueExitBlocks(Exits);
+  if (!Exits.empty()) {
+    // There is at least one non-deoptimizing exit.
+    //
+    // Note that BasicBlock::getPostdominatingDeoptimizeCall is not exact,
+    // as it can conservatively return false for deoptimizing exits whose
+    // control flow down to the deoptimize call is complex enough.
+    //
+    // That means we can report success here for a case where all exits are
+    // deoptimizing but one of them has complex enough control flow (e.g.
+    // with loops).
+    //
+    // That should be a very rare case, and false positives from this
+    // function only have a compile-time effect.
+    return any_of(Exits, [](const BasicBlock *BB) {
+      return !BB->getPostdominatingDeoptimizeCall();
+    });
+  }
   return false;
 }
 
@@ -208,319 +258,336 @@
 /// rotation. LoopRotate should be repeatable and converge to a canonical
 /// form. This property is satisfied because simplifying the loop latch can only
 /// happen once across multiple invocations of the LoopRotate pass.
+///
+/// If -loop-rotate-multi is enabled, we can do multiple rotations in one go
+/// so as to reach a suitable (non-deoptimizing) exit.
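+///
+/// For illustration, consider the shape of the multiple-deopt-exits test
+/// added below (abridged):
+///
+///   for.cond1:  ; header, exits to a deoptimizing block
+///     br i1 %zero.check, label %deopt.exit, label %for.cond2
+///   for.cond2:  ; exits to the ordinary %return path
+///     br i1 %for.check, label %for.body, label %return
+///   for.tail:   ; latch, does not exit the loop
+///     br label %for.cond1
+///
+/// The first rotation leaves the loop with a latch that exits through the
+/// cold deoptimizing path; with -loop-rotate-multi the rotation is repeated,
+/// and the loop ends up with its latch at the bounds check (for.cond2),
+/// exiting through %return.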
 bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
   // If the loop has only one block then there is not much to rotate.
   if (L->getBlocks().size() == 1)
     return false;
 
-  BasicBlock *OrigHeader = L->getHeader();
-  BasicBlock *OrigLatch = L->getLoopLatch();
-
-  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
-  if (!BI || BI->isUnconditional())
-    return false;
-
-  // If the loop header is not one of the loop exiting blocks then
-  // either this loop is already rotated or it is not
-  // suitable for loop rotation transformations.
-  if (!L->isLoopExiting(OrigHeader))
-    return false;
-
-  // If the loop latch already contains a branch that leaves the loop then the
-  // loop is already rotated.
-  if (!OrigLatch)
-    return false;
-
-  // Rotate if either the loop latch does *not* exit the loop, or if the loop
-  // latch was just simplified. Or if we think it will be profitable.
-  if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
-      !shouldRotateLoopExitingLatch(L))
-    return false;
-
-  // Check size of original header and reject loop if it is very big or we can't
-  // duplicate blocks inside it.
-  {
-    SmallPtrSet<const Value *, 32> EphValues;
-    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
-    CodeMetrics Metrics;
-    Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
-    if (Metrics.notDuplicatable) {
-      LLVM_DEBUG(
-          dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
-                 << " instructions: ";
-          L->dump());
-      return false;
-    }
-    if (Metrics.convergent) {
-      LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
-                           "instructions: ";
-                 L->dump());
-      return false;
-    }
-    if (Metrics.NumInsts > MaxHeaderSize)
-      return false;
-  }
-
-  // Now, this loop is suitable for rotation.
-  BasicBlock *OrigPreheader = L->getLoopPreheader();
-
-  // If the loop could not be converted to canonical form, it must have an
-  // indirectbr in it, just give up.
-  if (!OrigPreheader || !L->hasDedicatedExits())
-    return false;
-
-  // Anything ScalarEvolution may know about this loop or the PHI nodes
-  // in its header will soon be invalidated. We should also invalidate
-  // all outer loops because insertion and deletion of blocks that happens
-  // during the rotation may violate invariants related to backedge taken
-  // infos in them.
-  if (SE)
-    SE->forgetTopmostLoop(L);
-
-  LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
-  if (MSSAU && VerifyMemorySSA)
-    MSSAU->getMemorySSA()->verifyMemorySSA();
-
-  // Find new Loop header. NewHeader is a Header's one and only successor
-  // that is inside loop.  Header's other successor is outside the
-  // loop.  Otherwise loop is not suitable for rotation.
-  BasicBlock *Exit = BI->getSuccessor(0);
-  BasicBlock *NewHeader = BI->getSuccessor(1);
-  if (L->contains(Exit))
-    std::swap(Exit, NewHeader);
-  assert(NewHeader && "Unable to determine new loop header");
-  assert(L->contains(NewHeader) && !L->contains(Exit) &&
-         "Unable to determine loop header and exit blocks");
-
-  // This code assumes that the new header has exactly one predecessor.
-  // Remove any single-entry PHI nodes in it.
-  assert(NewHeader->getSinglePredecessor() &&
-         "New header doesn't have one pred!");
-  FoldSingleEntryPHINodes(NewHeader);
-
-  // Begin by walking OrigHeader and populating ValueMap with an entry for
-  // each Instruction.
-  BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
-  ValueToValueMapTy ValueMap, ValueMapMSSA;
-
-  // For PHI nodes, the value available in OldPreHeader is just the
-  // incoming value from OldPreHeader.
-  for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
-    InsertNewValueIntoMap(ValueMap, PN,
-                          PN->getIncomingValueForBlock(OrigPreheader));
-
-  // For the rest of the instructions, either hoist to the OrigPreheader if
-  // possible or create a clone in the OldPreHeader if not.
-  Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
-
-  // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
-  using DbgIntrinsicHash =
-      std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
-  auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash {
-    return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
-  };
-  SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
-  for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
-       I != E; ++I) {
-    if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I))
-      DbgIntrinsics.insert(makeHash(DII));
-    else
-      break;
-  }
-
-  while (I != E) {
-    Instruction *Inst = &*I++;
-
-    // If the instruction's operands are invariant and it doesn't read or write
-    // memory, then it is safe to hoist.  Doing this doesn't change the order of
-    // execution in the preheader, but does prevent the instruction from
-    // executing in each iteration of the loop.  This means it is safe to hoist
-    // something that might trap, but isn't safe to hoist something that reads
-    // memory (without proving that the loop doesn't write).
-    if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
-        !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
-        !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
-      Inst->moveBefore(LoopEntryBranch);
-      continue;
-    }
-
-    // Otherwise, create a duplicate of the instruction.
-    Instruction *C = Inst->clone();
-
-    // Eagerly remap the operands of the instruction.
-    RemapInstruction(C, ValueMap,
-                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
-    // Avoid inserting the same intrinsic twice.
-    if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
-      if (DbgIntrinsics.count(makeHash(DII))) {
-        C->deleteValue();
-        continue;
-      }
-
-    // With the operands remapped, see if the instruction constant folds or is
-    // otherwise simplifyable.  This commonly occurs because the entry from PHI
-    // nodes allows icmps and other instructions to fold.
-    Value *V = SimplifyInstruction(C, SQ);
-    if (V && LI->replacementPreservesLCSSAForm(C, V)) {
-      // If so, then delete the temporary instruction and stick the folded value
-      // in the map.
-      InsertNewValueIntoMap(ValueMap, Inst, V);
-      if (!C->mayHaveSideEffects()) {
-        C->deleteValue();
-        C = nullptr;
-      }
-    } else {
-      InsertNewValueIntoMap(ValueMap, Inst, C);
-    }
-    if (C) {
-      // Otherwise, stick the new instruction into the new block!
-      C->setName(Inst->getName());
-      C->insertBefore(LoopEntryBranch);
-
-      if (auto *II = dyn_cast<IntrinsicInst>(C))
-        if (II->getIntrinsicID() == Intrinsic::assume)
-          AC->registerAssumption(II);
-      // MemorySSA cares whether the cloned instruction was inserted or not, and
-      // not whether it can be remapped to a simplified value.
-      if (MSSAU)
-        InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
-    }
-  }
-
-  // Along with all the other instructions, we just cloned OrigHeader's
-  // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
-  // successors by duplicating their incoming values for OrigHeader.
-  for (BasicBlock *SuccBB : successors(OrigHeader))
-    for (BasicBlock::iterator BI = SuccBB->begin();
-         PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
-      PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
-  // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
-  // OrigPreHeader's old terminator (the original branch into the loop), and
-  // remove the corresponding incoming values from the PHI nodes in OrigHeader.
-  LoopEntryBranch->eraseFromParent();
-
-  // Update MemorySSA before the rewrite call below changes the 1:1
-  // instruction:cloned_instruction_or_value mapping.
-  if (MSSAU) {
-    InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
-    MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
-                                        ValueMapMSSA);
-  }
-
-  SmallVector<PHINode*, 2> InsertedPHIs;
-  // If there were any uses of instructions in the duplicated block outside the
-  // loop, update them, inserting PHI nodes as required
-  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
-                                  &InsertedPHIs);
-
-  // Attach dbg.value intrinsics to the new phis if that phi uses a value that
-  // previously had debug metadata attached. This keeps the debug info
-  // up-to-date in the loop body.
-  if (!InsertedPHIs.empty())
-    insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
-
-  // NewHeader is now the header of the loop.
-  L->moveToHeader(NewHeader);
-  assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
-  // Inform DT about changes to the CFG.
-  if (DT) {
-    // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
-    // the DT about the removed edge to the OrigHeader (that got removed).
-    SmallVector<DominatorTree::UpdateType, 3> Updates;
-    Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
-    Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
-    Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
-    DT->applyUpdates(Updates);
-
-    if (MSSAU) {
-      MSSAU->applyUpdates(Updates, *DT);
-      if (VerifyMemorySSA)
-        MSSAU->getMemorySSA()->verifyMemorySSA();
-    }
-  }
-
-  // At this point, we've finished our major CFG changes.  As part of cloning
-  // the loop into the preheader we've simplified instructions and the
-  // duplicated conditional branch may now be branching on a constant.  If it is
-  // branching on a constant and if that constant means that we enter the loop,
-  // then we fold away the cond branch to an uncond branch.  This simplifies the
-  // loop in cases important for nested loops, and it also means we don't have
-  // to split as many edges.
-  BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
-  assert(PHBI->isConditional() && "Should be clone of BI condbr!");
-  if (!isa<ConstantInt>(PHBI->getCondition()) ||
-      PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
-          NewHeader) {
-    // The conditional branch can't be folded, handle the general case.
-    // Split edges as necessary to preserve LoopSimplify form.
-
-    // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
-    // thus is not a preheader anymore.
-    // Split the edge to form a real preheader.
-    BasicBlock *NewPH = SplitCriticalEdge(
-        OrigPreheader, NewHeader,
-        CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
-    NewPH->setName(NewHeader->getName() + ".lr.ph");
-
-    // Preserve canonical loop form, which means that 'Exit' should have only
-    // one predecessor. Note that Exit could be an exit block for multiple
-    // nested loops, causing both of the edges to now be critical and need to
-    // be split.
-    SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
-    bool SplitLatchEdge = false;
-    for (BasicBlock *ExitPred : ExitPreds) {
-      // We only need to split loop exit edges.
-      Loop *PredLoop = LI->getLoopFor(ExitPred);
-      if (!PredLoop || PredLoop->contains(Exit) ||
-          ExitPred->getTerminator()->isIndirectTerminator())
-        continue;
-      SplitLatchEdge |= L->getLoopLatch() == ExitPred;
-      BasicBlock *ExitSplit = SplitCriticalEdge(
-          ExitPred, Exit,
-          CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
-      ExitSplit->moveBefore(Exit);
-    }
-    assert(SplitLatchEdge &&
-           "Despite splitting all preds, failed to split latch exit?");
-  } else {
-    // We can fold the conditional branch in the preheader, this makes things
-    // simpler. The first step is to remove the extra edge to the Exit block.
-    Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
-    BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
-    NewBI->setDebugLoc(PHBI->getDebugLoc());
-    PHBI->eraseFromParent();
-
-    // With our CFG finalized, update DomTree if it is available.
-    if (DT) DT->deleteEdge(OrigPreheader, Exit);
-
-    // Update MSSA too, if available.
-    if (MSSAU)
-      MSSAU->removeEdge(OrigPreheader, Exit);
-  }
-
-  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
-  assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
-
-  if (MSSAU && VerifyMemorySSA)
-    MSSAU->getMemorySSA()->verifyMemorySSA();
-
-  // Now that the CFG and DomTree are in a consistent state again, try to merge
-  // the OrigHeader block into OrigLatch.  This will succeed if they are
-  // connected by an unconditional branch.  This is just a cleanup so the
-  // emitted code isn't too gross in this common case.
-  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-  MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
-
-  if (MSSAU && VerifyMemorySSA)
-    MSSAU->getMemorySSA()->verifyMemorySSA();
-
-  LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
-
-  ++NumRotated;
+  bool Rotated = false;
+  do {
+    BasicBlock *OrigHeader = L->getHeader();
+    BasicBlock *OrigLatch = L->getLoopLatch();
+
+    BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+    if (!BI || BI->isUnconditional())
+      return Rotated;
+
+    // If the loop header is not one of the loop exiting blocks then
+    // either this loop is already rotated or it is not
+    // suitable for loop rotation transformations.
+    if (!L->isLoopExiting(OrigHeader))
+      return Rotated;
+
+    // If the loop latch already contains a branch that leaves the loop then the
+    // loop is already rotated.
+    if (!OrigLatch)
+      return Rotated;
+
+    // Rotate if either the loop latch does *not* exit the loop, or if the loop
+    // latch was just simplified. Or if we think it will be profitable.
+    if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
+        !profitableToRotateLoopExitingLatch(L) &&
+        !canRotateDeoptimizingLatchExit(L))
+      return Rotated;
+
+    // Check size of original header and reject loop if it is very big or we can't
+    // duplicate blocks inside it.
+    {
+      SmallPtrSet<const Value *, 32> EphValues;
+      CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+      CodeMetrics Metrics;
+      Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+      if (Metrics.notDuplicatable) {
+        LLVM_DEBUG(
+            dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+                   << " instructions: ";
+            L->dump());
+        return Rotated;
+      }
+      if (Metrics.convergent) {
+        LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+                             "instructions: ";
+                   L->dump());
+        return Rotated;
+      }
+      if (Metrics.NumInsts > MaxHeaderSize)
+        return Rotated;
+    }
+
+    // Now, this loop is suitable for rotation.
+    BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+    // If the loop could not be converted to canonical form, it must have an
+    // indirectbr in it, just give up.
+    if (!OrigPreheader || !L->hasDedicatedExits())
+      return Rotated;
+
+    // Anything ScalarEvolution may know about this loop or the PHI nodes
+    // in its header will soon be invalidated. We should also invalidate
+    // all outer loops because insertion and deletion of blocks that happens
+    // during the rotation may violate invariants related to backedge taken
+    // infos in them.
+    if (SE)
+      SE->forgetTopmostLoop(L);
+
+    LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+    if (MSSAU && VerifyMemorySSA)
+      MSSAU->getMemorySSA()->verifyMemorySSA();
+
+    // Find new Loop header. NewHeader is a Header's one and only successor
+    // that is inside loop.  Header's other successor is outside the
+    // loop.  Otherwise loop is not suitable for rotation.
+    BasicBlock *Exit = BI->getSuccessor(0);
+    BasicBlock *NewHeader = BI->getSuccessor(1);
+    if (L->contains(Exit))
+      std::swap(Exit, NewHeader);
+    assert(NewHeader && "Unable to determine new loop header");
+    assert(L->contains(NewHeader) && !L->contains(Exit) &&
+           "Unable to determine loop header and exit blocks");
+
+    // This code assumes that the new header has exactly one predecessor.
+    // Remove any single-entry PHI nodes in it.
+    assert(NewHeader->getSinglePredecessor() &&
+           "New header doesn't have one pred!");
+    FoldSingleEntryPHINodes(NewHeader);
+
+    // Begin by walking OrigHeader and populating ValueMap with an entry for
+    // each Instruction.
+    BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+    ValueToValueMapTy ValueMap, ValueMapMSSA;
+
+    // For PHI nodes, the value available in OldPreHeader is just the
+    // incoming value from OldPreHeader.
+    for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+      InsertNewValueIntoMap(ValueMap, PN,
+                            PN->getIncomingValueForBlock(OrigPreheader));
+
+    // For the rest of the instructions, either hoist to the OrigPreheader if
+    // possible or create a clone in the OldPreHeader if not.
+    Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
+
+    // Record all debug intrinsics preceding LoopEntryBranch to avoid
+    // duplication.
+    using DbgIntrinsicHash =
+        std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
+    auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash {
+      return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
+    };
+    SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
+    for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
+         I != E; ++I) {
+      if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I))
+        DbgIntrinsics.insert(makeHash(DII));
+      else
+        break;
+    }
+
+    while (I != E) {
+      Instruction *Inst = &*I++;
+
+      // If the instruction's operands are invariant and it doesn't read or write
+      // memory, then it is safe to hoist.  Doing this doesn't change the order of
+      // execution in the preheader, but does prevent the instruction from
+      // executing in each iteration of the loop.  This means it is safe to hoist
+      // something that might trap, but isn't safe to hoist something that reads
+      // memory (without proving that the loop doesn't write).
+      if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+          !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
+          !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
+        Inst->moveBefore(LoopEntryBranch);
+        continue;
+      }
+
+      // Otherwise, create a duplicate of the instruction.
+      Instruction *C = Inst->clone();
+
+      // Eagerly remap the operands of the instruction.
+      RemapInstruction(C, ValueMap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+      // Avoid inserting the same intrinsic twice.
+      if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
+        if (DbgIntrinsics.count(makeHash(DII))) {
+          C->deleteValue();
+          continue;
+        }
+
+      // With the operands remapped, see if the instruction constant folds or is
+      // otherwise simplifyable.  This commonly occurs because the entry from PHI
+      // nodes allows icmps and other instructions to fold.
+      Value *V = SimplifyInstruction(C, SQ);
+      if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+        // If so, then delete the temporary instruction and stick the folded value
+        // in the map.
+        InsertNewValueIntoMap(ValueMap, Inst, V);
+        if (!C->mayHaveSideEffects()) {
+          C->deleteValue();
+          C = nullptr;
+        }
+      } else {
+        InsertNewValueIntoMap(ValueMap, Inst, C);
+      }
+      if (C) {
+        // Otherwise, stick the new instruction into the new block!
+        C->setName(Inst->getName());
+        C->insertBefore(LoopEntryBranch);
+
+        if (auto *II = dyn_cast<IntrinsicInst>(C))
+          if (II->getIntrinsicID() == Intrinsic::assume)
+            AC->registerAssumption(II);
+        // MemorySSA cares whether the cloned instruction was inserted or not, and
+        // not whether it can be remapped to a simplified value.
+        if (MSSAU)
+          InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
+      }
+    }
+
+    // Along with all the other instructions, we just cloned OrigHeader's
+    // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+    // successors by duplicating their incoming values for OrigHeader.
+    for (BasicBlock *SuccBB : successors(OrigHeader))
+      for (BasicBlock::iterator BI = SuccBB->begin();
+           PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+        PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+    // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+    // OrigPreHeader's old terminator (the original branch into the loop), and
+    // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+    LoopEntryBranch->eraseFromParent();
+
+    // Update MemorySSA before the rewrite call below changes the 1:1
+    // instruction:cloned_instruction_or_value mapping.
+    if (MSSAU) {
+      InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
+      MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
+                                          ValueMapMSSA);
+    }
+
+    SmallVector<PHINode*, 2> InsertedPHIs;
+    // If there were any uses of instructions in the duplicated block outside the
+    // loop, update them, inserting PHI nodes as required
+    RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+                                    &InsertedPHIs);
+
+    // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+    // previously had debug metadata attached. This keeps the debug info
+    // up-to-date in the loop body.
+    if (!InsertedPHIs.empty())
+      insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
+
+    // NewHeader is now the header of the loop.
+    L->moveToHeader(NewHeader);
+    assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+    // Inform DT about changes to the CFG.
+    if (DT) {
+      // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
+      // the DT about the removed edge to the OrigHeader (that got removed).
+      SmallVector<DominatorTree::UpdateType, 3> Updates;
+      Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
+      Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
+      Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
+      DT->applyUpdates(Updates);
+
+      if (MSSAU) {
+        MSSAU->applyUpdates(Updates, *DT);
+        if (VerifyMemorySSA)
+          MSSAU->getMemorySSA()->verifyMemorySSA();
+      }
+    }
+
+    // At this point, we've finished our major CFG changes.  As part of cloning
+    // the loop into the preheader we've simplified instructions and the
+    // duplicated conditional branch may now be branching on a constant.  If it is
+    // branching on a constant and if that constant means that we enter the loop,
+    // then we fold away the cond branch to an uncond branch.  This simplifies the
+    // loop in cases important for nested loops, and it also means we don't have
+    // to split as many edges.
+    BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+    assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+    if (!isa<ConstantInt>(PHBI->getCondition()) ||
+        PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+            NewHeader) {
+      // The conditional branch can't be folded, handle the general case.
+      // Split edges as necessary to preserve LoopSimplify form.
+
+      // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+      // thus is not a preheader anymore.
+      // Split the edge to form a real preheader.
+      BasicBlock *NewPH = SplitCriticalEdge(
+          OrigPreheader, NewHeader,
+          CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+      NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+      // Preserve canonical loop form, which means that 'Exit' should have only
+      // one predecessor. Note that Exit could be an exit block for multiple
+      // nested loops, causing both of the edges to now be critical and need to
+      // be split.
+      SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+      bool SplitLatchEdge = false;
+      for (BasicBlock *ExitPred : ExitPreds) {
+        // We only need to split loop exit edges.
+        Loop *PredLoop = LI->getLoopFor(ExitPred);
+        if (!PredLoop || PredLoop->contains(Exit) ||
+            ExitPred->getTerminator()->isIndirectTerminator())
+          continue;
+        SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+        BasicBlock *ExitSplit = SplitCriticalEdge(
+            ExitPred, Exit,
+            CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+        ExitSplit->moveBefore(Exit);
+      }
+      assert(SplitLatchEdge &&
+             "Despite splitting all preds, failed to split latch exit?");
+    } else {
+      // We can fold the conditional branch in the preheader, this makes things
+      // simpler. The first step is to remove the extra edge to the Exit block.
+      Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+      BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+      NewBI->setDebugLoc(PHBI->getDebugLoc());
+      PHBI->eraseFromParent();
+
+      // With our CFG finalized, update DomTree if it is available.
+      if (DT) DT->deleteEdge(OrigPreheader, Exit);
+
+      // Update MSSA too, if available.
+      if (MSSAU)
+        MSSAU->removeEdge(OrigPreheader, Exit);
+    }
+
+    assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+    assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+    if (MSSAU && VerifyMemorySSA)
+      MSSAU->getMemorySSA()->verifyMemorySSA();
+
+    // Now that the CFG and DomTree are in a consistent state again, try to merge
+    // the OrigHeader block into OrigLatch.  This will succeed if they are
+    // connected by an unconditional branch.  This is just a cleanup so the
+    // emitted code isn't too gross in this common case.
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+    MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
+
+    if (MSSAU && VerifyMemorySSA)
+      MSSAU->getMemorySSA()->verifyMemorySSA();
+
+    LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+    ++NumRotated;
+
+    Rotated = true;
+    SimplifiedLatch = false;
+
+    // Check that the new latch is a deoptimizing exit and, if it is, repeat
+    // the rotation when possible. A deoptimizing latch exit is not the
+    // typical case, so we simply loop over the rotation.
+    // TODO: if this becomes a performance bottleneck, extend the rotation
+    // algorithm to handle multiple rotations in one go.
+  } while (MultiRotate && canRotateDeoptimizingLatchExit(L));
 
   return true;
 }
diff --git a/llvm/test/Transforms/LoopRotate/multiple-deopt-exits.ll b/llvm/test/Transforms/LoopRotate/multiple-deopt-exits.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopRotate/multiple-deopt-exits.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -loop-rotate -loop-rotate-multi=true | FileCheck %s
+; RUN: opt -S < %s -passes='loop(rotate)' -loop-rotate-multi=true | FileCheck %s
+
+; Test loop rotation with multiple exits, some of them deoptimizing.
+; We should end up with a latch whose exit is non-deoptimizing, so we should
+; rotate more than once.
+
+declare i32 @llvm.experimental.deoptimize.i32(...)
+
+define i32 @test_cond_with_one_deopt_exit(i32 * nonnull %a, i64 %x) {
+; Rotation is done twice.
+; The latch should end up at the 2nd condition (for.cond2), exiting to %return.
+;
+; CHECK-LABEL: @test_cond_with_one_deopt_exit(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VAL_A_IDX3:%.*]] = load i32, i32* %a, align 4
+; CHECK-NEXT:    [[ZERO_CHECK4:%.*]] = icmp eq i32 [[VAL_A_IDX3]], 0
+; CHECK-NEXT:    br i1 [[ZERO_CHECK4]], label %deopt.exit, label %for.cond2.lr.ph
+; CHECK:       for.cond2.lr.ph:
+; CHECK-NEXT:    [[FOR_CHECK8:%.*]] = icmp ult i64 0, %x
+; CHECK-NEXT:    br i1 [[FOR_CHECK8]], label %for.body.lr.ph, label %return
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    br label %for.body
+; CHECK:       for.cond2:
+; CHECK:         [[FOR_CHECK:%.*]] = icmp ult i64 {{%.*}}, %x
+; CHECK-NEXT:    br i1 [[FOR_CHECK]], label %for.body, label %for.cond2.return_crit_edge
+; CHECK:       for.body:
+; CHECK:         br label %for.tail
+; CHECK:       for.tail:
+; CHECK:         [[VAL_A_IDX:%.*]] = load i32, i32*
+; CHECK-NEXT:    [[ZERO_CHECK:%.*]] = icmp eq i32 [[VAL_A_IDX]], 0
+; CHECK-NEXT:    br i1 [[ZERO_CHECK]], label %for.cond1.deopt.exit_crit_edge, label %for.cond2
+; CHECK:       for.cond2.return_crit_edge:
+; CHECK-NEXT:    {{%.*}} = phi i32
+; CHECK-NEXT:    br label %return
+; CHECK:       return:
+; CHECK-NEXT:    [[SUM_LCSSA2:%.*]] = phi i32
+; CHECK-NEXT:    ret i32 [[SUM_LCSSA2]]
+; CHECK:       for.cond1.deopt.exit_crit_edge:
+; CHECK-NEXT:    {{%.*}} = phi i32
+; CHECK-NEXT:    br label %deopt.exit
+; CHECK:       deopt.exit:
+; CHECK:         [[DEOPT_VAL:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 {{%.*}}) ]
+; CHECK-NEXT:    ret i32 [[DEOPT_VAL]]
+;
+entry:
+  br label %for.cond1
+
+for.cond1:
+  %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.tail ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.tail ]
+  %a.idx = getelementptr inbounds i32, i32 *%a, i64 %idx
+  %val.a.idx = load i32, i32* %a.idx, align 4
+  %zero.check = icmp eq i32 %val.a.idx, 0
+  br i1 %zero.check, label %deopt.exit, label %for.cond2
+
+for.cond2:
+  %for.check = icmp ult i64 %idx, %x
+  br i1 %for.check, label %for.body, label %return
+
+for.body:
+  br label %for.tail
+
+for.tail:
+  %sum.next = add i32 %sum, %val.a.idx
+  %idx.next = add nuw nsw i64 %idx, 1
+  br label %for.cond1
+
+return:
+  ret i32 %sum
+
+deopt.exit:
+  %deopt.val = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %val.a.idx) ]
+  ret i32 %deopt.val
+}
+
+define i32 @test_cond_with_two_deopt_exits(i32 ** nonnull %a, i64 %x) {
+; Rotation is done three times.
+; The latch should end up at the 3rd condition (for.cond3), exiting to %return.
+;
+; CHECK-LABEL: @test_cond_with_two_deopt_exits(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_IDX_DEREF4:%.*]] = load i32*, i32** %a
+; CHECK-NEXT:    [[NULL_CHECK5:%.*]] = icmp eq i32* [[A_IDX_DEREF4]], null
+; CHECK-NEXT:    br i1 [[NULL_CHECK5]], label %deopt.exit1, label %for.cond2.lr.ph
+; CHECK:       for.cond2.lr.ph:
+; CHECK-NEXT:    [[VAL_A_IDX9:%.*]] = load i32, i32* [[A_IDX_DEREF4]], align 4
+; CHECK-NEXT:    [[ZERO_CHECK10:%.*]] = icmp eq i32 [[VAL_A_IDX9]], 0
+; CHECK-NEXT:    br i1 [[ZERO_CHECK10]], label %deopt.exit2, label %for.cond3.lr.ph
+; CHECK:       for.cond3.lr.ph:
+; CHECK-NEXT:    [[FOR_CHECK14:%.*]] = icmp ult i64 0, %x
+; CHECK-NEXT:    br i1 [[FOR_CHECK14]], label %for.body.lr.ph, label %return
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    br label %for.body
+; CHECK:       for.cond2:
+; CHECK:         [[VAL_A_IDX:%.*]] = load i32, i32*
+; CHECK-NEXT:    [[ZERO_CHECK:%.*]] = icmp eq i32 [[VAL_A_IDX]], 0
+; CHECK-NEXT:    br i1 [[ZERO_CHECK]], label %for.cond2.deopt.exit2_crit_edge, label %for.cond3
+; CHECK:       for.cond3:
+; CHECK:         [[FOR_CHECK:%.*]] = icmp ult i64 {{%.*}}, %x
+; CHECK-NEXT:    br i1 [[FOR_CHECK]], label %for.body, label %for.cond3.return_crit_edge
+; CHECK:       for.body:
+; CHECK:         br label %for.tail
+; CHECK:       for.tail:
+; CHECK:         [[IDX_NEXT:%.*]] = add nuw nsw i64 {{%.*}}, 1
+; CHECK:         [[NULL_CHECK:%.*]] = icmp eq i32* {{%.*}}, null
+; CHECK-NEXT:    br i1 [[NULL_CHECK]], label %for.cond1.deopt.exit1_crit_edge, label %for.cond2
+; CHECK:       for.cond3.return_crit_edge:
+; CHECK-NEXT:    [[SPLIT18:%.*]] = phi i32
+; CHECK-NEXT:    br label %return
+; CHECK:       return:
+; CHECK-NEXT:    [[SUM_LCSSA2:%.*]] = phi i32
+; CHECK-NEXT:    ret i32 [[SUM_LCSSA2]]
+; CHECK:       for.cond1.deopt.exit1_crit_edge:
+; CHECK-NEXT:    br label %deopt.exit1
+; CHECK:       deopt.exit1:
+; CHECK-NEXT:    [[DEOPT_VAL1:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 0) ]
+; CHECK-NEXT:    ret i32 [[DEOPT_VAL1]]
+; CHECK:       for.cond2.deopt.exit2_crit_edge:
+; CHECK-NEXT:    [[SPLIT:%.*]] = phi i32
+; CHECK-NEXT:    br label %deopt.exit2
+; CHECK:       deopt.exit2:
+; CHECK-NEXT:    [[VAL_A_IDX_LCSSA:%.*]] = phi i32
+; CHECK-NEXT:    [[DEOPT_VAL2:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 [[VAL_A_IDX_LCSSA]]) ]
+; CHECK-NEXT:    ret i32 [[DEOPT_VAL2]]
+;
+entry:
+  br label %for.cond1
+
+for.cond1:
+  %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.tail ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.tail ]
+  %a.idx = getelementptr inbounds i32*, i32 **%a, i64 %idx
+  %a.idx.deref = load i32*, i32** %a.idx
+  %null.check = icmp eq i32* %a.idx.deref, null
+  br i1 %null.check, label %deopt.exit1, label %for.cond2
+
+for.cond2:
+  %val.a.idx = load i32, i32* %a.idx.deref, align 4
+  %zero.check = icmp eq i32 %val.a.idx, 0
+  br i1 %zero.check, label %deopt.exit2, label %for.cond3
+
+for.cond3:
+  %for.check = icmp ult i64 %idx, %x
+  br i1 %for.check, label %for.body, label %return
+
+for.body:
+  br label %for.tail
+
+for.tail:
+  %sum.next = add i32 %sum, %val.a.idx
+  %idx.next = add nuw nsw i64 %idx, 1
+  br label %for.cond1
+
+return:
+  ret i32 %sum
+
+deopt.exit1:
+  %deopt.val1 = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 0) ]
+  ret i32 %deopt.val1
+deopt.exit2:
+  %deopt.val2 = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %val.a.idx) ]
+  ret i32 %deopt.val2
+}
diff --git a/llvm/unittests/Transforms/Utils/CMakeLists.txt b/llvm/unittests/Transforms/Utils/CMakeLists.txt
--- a/llvm/unittests/Transforms/Utils/CMakeLists.txt
+++ b/llvm/unittests/Transforms/Utils/CMakeLists.txt
@@ -15,6 +15,7 @@
   FunctionComparatorTest.cpp
   IntegerDivisionTest.cpp
   LocalTest.cpp
+  LoopRotationUtilsTest.cpp
   LoopUtilsTest.cpp
   SizeOptsTest.cpp
   SSAUpdaterBulkTest.cpp
diff --git a/llvm/unittests/Transforms/Utils/LoopRotationUtilsTest.cpp b/llvm/unittests/Transforms/Utils/LoopRotationUtilsTest.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/Transforms/Utils/LoopRotationUtilsTest.cpp
@@ -0,0 +1,166 @@
+//===- LoopRotationUtilsTest.cpp - Unit tests for LoopRotation utility ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
+  SMDiagnostic Err;
+  std::unique_ptr<Module> Mod = parseAssemblyString(IR, Err, C);
+  if (!Mod)
+    Err.print("LoopRotationUtilsTest", errs());
+  return Mod;
+}
+
+/// This test contains a multi-deopt-exit pattern that allows loop rotation
+/// to trigger more than once when multiple rotations are enabled.
+/// At least one rotation should be performed regardless of the loop rotation
+/// settings.
+TEST(LoopRotate, MultiDeoptExit) {
+  LLVMContext C;
+
+  std::unique_ptr<Module> M = parseIR(
+    C,
+    R"(
+declare i32 @llvm.experimental.deoptimize.i32(...)
+
+define i32 @test(i32 * nonnull %a, i64 %x) {
+entry:
+  br label %for.cond1
+
+for.cond1:
+  %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.tail ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.tail ]
+  %a.idx = getelementptr inbounds i32, i32 *%a, i64 %idx
+  %val.a.idx = load i32, i32* %a.idx, align 4
+  %zero.check = icmp eq i32 %val.a.idx, 0
+  br i1 %zero.check, label %deopt.exit, label %for.cond2
+
+for.cond2:
+  %for.check = icmp ult i64 %idx, %x
+  br i1 %for.check, label %for.body, label %return
+
+for.body:
+  br label %for.tail
+
+for.tail:
+  %sum.next = add i32 %sum, %val.a.idx
+  %idx.next = add nuw nsw i64 %idx, 1
+  br label %for.cond1
+
+return:
+  ret i32 %sum
+
+deopt.exit:
+  %deopt.val = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %val.a.idx) ]
+  ret i32 %deopt.val
+})"
+  );
+
+  auto *F = M->getFunction("test");
+  DominatorTree DT(*F);
+  LoopInfo LI(DT);
+  AssumptionCache AC(*F);
+  TargetTransformInfo TTI(M->getDataLayout());
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI(TLII);
+  ScalarEvolution SE(*F, TLI, AC, DT, LI);
+  SimplifyQuery SQ(M->getDataLayout());
+
+  Loop *L = *LI.begin();
+
+  bool ret = LoopRotation(L, &LI, &TTI,
+                          &AC, &DT,
+                          &SE, nullptr,
+                          SQ, true, -1, false);
+  EXPECT_TRUE(ret);
+}
+
+/// Checks a special case of a multi-deopt-exit loop that cannot perform the
+/// required number of rotations because the desired header contains
+/// non-duplicatable code.
+/// Like the MultiDeoptExit test, this one should do at least one rotation and
+/// pass regardless of the loop rotation settings.
+TEST(LoopRotate, MultiDeoptExit_Nondup) {
+  LLVMContext C;
+
+  std::unique_ptr<Module> M = parseIR(
+    C,
+    R"(
+; Rotation should be done once, attempted twice.
+; The second attempt fails due to the non-duplicatable header.
+
+declare i32 @llvm.experimental.deoptimize.i32(...)
+
+declare void @nondup()
+
+define i32 @test_nondup(i32 * nonnull %a, i64 %x) {
+entry:
+  br label %for.cond1
+
+for.cond1:
+  %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.tail ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.tail ]
+  %a.idx = getelementptr inbounds i32, i32 *%a, i64 %idx
+  %val.a.idx = load i32, i32* %a.idx, align 4
+  %zero.check = icmp eq i32 %val.a.idx, 0
+  br i1 %zero.check, label %deopt.exit, label %for.cond2
+
+for.cond2:
+  call void @nondup() noduplicate
+  %for.check = icmp ult i64 %idx, %x
+  br i1 %for.check, label %for.body, label %return
+
+for.body:
+  br label %for.tail
+
+for.tail:
+  %sum.next = add i32 %sum, %val.a.idx
+  %idx.next = add nuw nsw i64 %idx, 1
+  br label %for.cond1
+
+return:
+  ret i32 %sum
+
+deopt.exit:
+  %deopt.val = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %val.a.idx) ]
+  ret i32 %deopt.val
+})"
+  );
+
+  auto *F = M->getFunction("test_nondup");
+  DominatorTree DT(*F);
+  LoopInfo LI(DT);
+  AssumptionCache AC(*F);
+  TargetTransformInfo TTI(M->getDataLayout());
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI(TLII);
+  ScalarEvolution SE(*F, TLI, AC, DT, LI);
+  SimplifyQuery SQ(M->getDataLayout());
+
+  Loop *L = *LI.begin();
+
+  bool ret = LoopRotation(L, &LI, &TTI,
+                          &AC, &DT,
+                          &SE, nullptr,
+                          SQ, true, -1, false);
+  /// LoopRotation should report "true": the first rotation is still performed,
+  /// so we do change the IR.
+  EXPECT_TRUE(ret);
+}
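
As a usage reference, the unit tests above call the LoopRotation utility directly. Below is a minimal sketch of such a call; the helper name rotateFirstLoop is hypothetical, and the meanings of the trailing positional arguments (RotationOnly, Threshold, IsUtilMode) are taken from the calls in LoopRotationUtilsTest.cpp above.

// Minimal sketch (not part of the patch): driving the LoopRotation utility
// the same way LoopRotationUtilsTest.cpp does. The helper name is
// hypothetical; the argument list mirrors the unit tests.
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopRotationUtils.h"

using namespace llvm;

static bool rotateFirstLoop(LoopInfo &LI, const TargetTransformInfo &TTI,
                            AssumptionCache &AC, DominatorTree &DT,
                            ScalarEvolution &SE, const SimplifyQuery &SQ) {
  if (LI.empty())
    return false; // No top-level loops to rotate.
  Loop *L = *LI.begin();
  // MSSAU is nullptr: MemorySSA is not preserved here. The trailing arguments
  // are RotationOnly=true, Threshold=-1 (no header-size limit), and
  // IsUtilMode=false, so the profitability checks - including the new
  // canRotateDeoptimizingLatchExit - are consulted before rotating.
  return LoopRotation(L, &LI, &TTI, &AC, &DT, &SE, /*MSSAU=*/nullptr, SQ,
                      /*RotationOnly=*/true, /*Threshold=*/unsigned(-1),
                      /*IsUtilMode=*/false);
}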