diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -414,6 +414,7 @@ void initializeTypeBasedAAWrapperPassPass(PassRegistry&); void initializeTypePromotionPass(PassRegistry&); void initializeUnifyFunctionExitNodesPass(PassRegistry&); +void initializeUnifyLoopExitsPass(PassRegistry &); void initializeUnpackMachineBundlesPass(PassRegistry&); void initializeUnreachableBlockElimLegacyPassPass(PassRegistry&); void initializeUnreachableMachineBlockElimPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -229,7 +229,8 @@ (void) llvm::createScalarizeMaskedMemIntrinPass(); (void) llvm::createWarnMissedTransformationsPass(); (void) llvm::createHardwareLoopsPass(); - (void)llvm::createInjectTLIMappingsLegacyPass(); + (void) llvm::createInjectTLIMappingsLegacyPass(); + (void) llvm::createUnifyLoopExitsPass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); diff --git a/llvm/include/llvm/Transforms/Utils.h b/llvm/include/llvm/Transforms/Utils.h --- a/llvm/include/llvm/Transforms/Utils.h +++ b/llvm/include/llvm/Transforms/Utils.h @@ -126,6 +126,14 @@ // scalar-to-vector mappings from the TargetLibraryInfo. // FunctionPass *createInjectTLIMappingsLegacyPass(); + +//===----------------------------------------------------------------------===// +// +// UnifyLoopExits - For each loop, creates a new block N such that all exiting +// blocks branch to N, and then N distributes control flow to all the original +// exit blocks. +// +FunctionPass *createUnifyLoopExitsPass(); } #endif diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -17,6 +17,7 @@ // FIXME: Move to this file: BasicBlock::removePredecessor, BB::splitBasicBlock #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -363,6 +364,77 @@ BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr); +/// Given a set of incoming and outgoing blocks, create a "hub" such that every +/// edge from an incoming block InBB to an outgoing block OutBB is now split +/// into two edges, one from InBB to the hub and another from the hub to +/// OutBB. The hub consists of a series of guard blocks, one for each outgoing +/// block. Each guard block conditionally branches to the corresponding outgoing +/// block, or the next guard block in the chain. These guard blocks are returned +/// in the argument vector. +/// +/// Since the control flow edges from InBB to OutBB have now been replaced, the +/// function also updates any PHINodes in OutBB. For each such PHINode, the +/// operands corresponding to incoming blocks are moved to a new PHINode in the +/// hub, and the hub is made an operand of the original PHINode. +/// +/// Input CFG: +/// ---------- +/// +/// Def +/// | +/// v +/// In1 In2 +/// | | +/// | | +/// v v +/// Foo ---> Out1 Out2 +/// | +/// v +/// Use +/// +/// +/// Create hub: Incoming = {In1, In2}, Outgoing = {Out1, Out2} +/// ---------------------------------------------------------- +/// +/// Def +/// | +/// v +/// In1 In2 Foo +/// | Hub | | +/// | + - - | - - + | +/// | ' v ' V +/// +------> Guard1 -----> Out1 +/// ' | ' +/// ' v ' +/// ' Guard2 -----> Out2 +/// ' ' | +/// + - - - - - + | +/// v +/// Use +/// +/// Limitation: +/// ----------- +/// The updates to the PHINodes are not sufficient to restore SSA form. Consider +/// a definition Def, its use Use, incoming block In2 and outgoing block Out2, +/// such that: +/// 1. In2 is reachable from D or contains D. +/// 2. U is reachable from Out2 or is contained in Out2. +/// 3. U is not a PHINode if U is contained in Out2. +/// +/// Clearly, Def dominates Out2 since the program is valid SSA. But when the hub +/// is introduced, there is a new path through the hub along which Use is +/// reachable from entry without passing through Def, and SSA is no longer +/// valid. To fix this, we need to look at all the blocks post-dominated by the +/// hub on the one hand, and dominated by Out2 on the other. This is left for +/// the caller to accomplish, since each specific use of this function may have +/// additional information which simplifies this fixup. For example, see +/// restoreSSA() in the UnifyLoopExits pass. +BasicBlock *CreateControlFlowHub(DomTreeUpdater *DTU, + SmallVectorImpl &GuardBlocks, + const SetVector &Predecessors, + const SetVector &Successors, + const StringRef Prefix); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_BASICBLOCKUTILS_H diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -533,6 +533,13 @@ /// value? bool canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx); +//===----------------------------------------------------------------------===// +// Value helper functions +// + +/// Invert the given true/false value, possibly reusing an existing copy. +Value *invertCondition(Value *Condition); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_LOCAL_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -184,6 +184,11 @@ cl::init(true), cl::Hidden); +static cl::opt EnableStructurizerWorkarounds( + "amdgpu-enable-structurizer-workarounds", + cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(false), + cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -843,6 +848,9 @@ // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); if (!LateCFGStructurize) { + if (EnableStructurizerWorkarounds) { + addPass(createUnifyLoopExitsPass()); + } addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions } addPass(createSinkingPass()); diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -44,6 +44,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include #include @@ -218,8 +219,6 @@ void analyzeLoops(RegionNode *N); - Value *invert(Value *Condition); - Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); void gatherPredicates(RegionNode *N); @@ -405,39 +404,6 @@ } } -/// Invert the given condition -Value *StructurizeCFG::invert(Value *Condition) { - // First: Check if it's a constant - if (Constant *C = dyn_cast(Condition)) - return ConstantExpr::getNot(C); - - // Second: If the condition is already inverted, return the original value - Value *NotCondition; - if (match(Condition, m_Not(m_Value(NotCondition)))) - return NotCondition; - - if (Instruction *Inst = dyn_cast(Condition)) { - // Third: Check all the users for an invert - BasicBlock *Parent = Inst->getParent(); - for (User *U : Condition->users()) - if (Instruction *I = dyn_cast(U)) - if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition)))) - return I; - - // Last option: Create a new instruction - return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); - } - - if (Argument *Arg = dyn_cast(Condition)) { - BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock(); - return BinaryOperator::CreateNot(Condition, - Arg->getName() + ".inv", - EntryBlock.getTerminator()); - } - - llvm_unreachable("Unhandled condition to invert"); -} - /// Build the condition for one edge Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx, bool Invert) { @@ -446,7 +412,7 @@ Cond = Term->getCondition(); if (Idx != (unsigned)Invert) - Cond = invert(Cond); + Cond = invertCondition(Cond); } return Cond; } diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -1102,3 +1102,278 @@ } return BI->getCondition(); } + +// After creating a control flow hub, the operands of PHINodes in an outgoing +// block Out no longer match the predecessors of that block. Predecessors of Out +// that are incoming blocks to the hub are now replaced by just one edge from +// the hub. To match this new control flow, the corresponding values from each +// PHINode must now be moved a new PHINode in the first guard block of the hub. +// +// This operation cannot be performed with SSAUpdater, because it involves one +// new use: If the block Out is in the list of Incoming blocks, then the newly +// created PHI in the Hub will use itself along that edge from Out to Hub. +static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock, + const SetVector &Incoming, + BasicBlock *FirstGuardBlock) { + auto I = Out->begin(); + while (I != Out->end() && isa(I)) { + auto Phi = cast(I); + auto NewPhi = + PHINode::Create(Phi->getType(), Incoming.size(), + Phi->getName() + ".moved", &FirstGuardBlock->back()); + for (auto In : Incoming) { + Value *V = UndefValue::get(Phi->getType()); + if (In == Out) { + V = NewPhi; + } else if (Phi->getBasicBlockIndex(In) != -1) { + V = Phi->removeIncomingValue(In, false); + } + NewPhi->addIncoming(V, In); + } + assert(NewPhi->getNumIncomingValues() == Incoming.size()); + if (Phi->getNumOperands() == 0) { + Phi->replaceAllUsesWith(NewPhi); + I = Phi->eraseFromParent(); + continue; + } + Phi->addIncoming(NewPhi, GuardBlock); + ++I; + } +} + +using BBPredicates = DenseMap; +using PredMap = DenseMap; +using BBSetVector = SetVector; + +// Collect predicates for each outgoing block: +// OutBB x InBB -> Value +// +// These are used by the Hub to transfer control to the outgoing blocks. +static void collectPredicates(PredMap &Predicates, + SmallVectorImpl &Inverted, + const BBSetVector &Incoming, + const BBSetVector &Outgoing) { + auto &Context = Incoming.front()->getContext(); + auto BoolTrue = ConstantInt::getTrue(Context); + + for (auto Out : Outgoing) { + for (auto In : predecessors(Out)) { + if (!Incoming.count(In)) + continue; + auto Branch = cast(In->getTerminator()); + if (!Branch->isConditional()) { + Predicates[Out][In] = BoolTrue; + continue; + } + auto Condition = Branch->getCondition(); + if (Branch->getSuccessor(0) == Out) { + Predicates[Out][In] = Condition; + continue; + } + auto Inv = invertCondition(Condition); + Predicates[Out][In] = Inv; + Inverted.push_back(Inv); + Inverted.push_back(Condition); + } + } + + // Optimization: For each incoming block with a conditional branch, where only + // one successor is in an outgoing block, the predicate is always true. + for (auto In : Incoming) { + auto Branch = cast(In->getTerminator()); + if (!Branch->isConditional()) + continue; + auto Succ0 = Branch->getSuccessor(0); + auto Succ1 = Branch->getSuccessor(1); + if (Outgoing.count(Succ0) && !Outgoing.count(Succ1)) { + Predicates[Succ0][In] = BoolTrue; + } else if (Outgoing.count(Succ1) && !Outgoing.count(Succ0)) { + Predicates[Succ1][In] = BoolTrue; + } + } +} + +static void redirectIncomingToHub(const BBSetVector &Incoming, + const BBSetVector &Outgoing, + BasicBlock *FirstGuardBlock) { + // Replace the appropriate successors of each incoming block with the Hub. + for (auto In : Incoming) { + auto Branch = cast(In->getTerminator()); + if (!Branch->isConditional()) { + Branch->setSuccessor(0, FirstGuardBlock); + continue; + } + auto Succ0 = Branch->getSuccessor(0); + auto Succ1 = Branch->getSuccessor(1); + if (Outgoing.count(Succ0)) { + if (Outgoing.count(Succ1)) { + Branch->eraseFromParent(); + BranchInst::Create(FirstGuardBlock, In); + continue; + } + Branch->setSuccessor(0, FirstGuardBlock); + continue; + } + assert(Outgoing.count(Succ1)); + Branch->setSuccessor(1, FirstGuardBlock); + } +} + +// Create one guard predicate in FirstGuardBlock for each outgoing block, that +// decides whether control should flow to that block. This guard is a Boolean +// PHINode over the predicates previously stored for that block. +static void createGuardPredicates(BBPredicates &GuardPredicates, + PredMap &OldPredicates, + const BBSetVector &Incoming, + const BBSetVector &Outgoing, + BasicBlock *FirstGuardBlock) { + auto &Context = Incoming.front()->getContext(); + auto BoolTrue = ConstantInt::getTrue(Context); + auto BoolFalse = ConstantInt::getFalse(Context); + + // The predicate for the last successor is trivially true, and so we process + // only the first N-1 successors. + for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) { + auto Out = Outgoing[i]; + LLVM_DEBUG(dbgs() << "Creating guard for " << Out->getName() << "\n"); + auto Phi = + PHINode::Create(Type::getInt1Ty(Context), Incoming.size(), + StringRef("Guard.") + Out->getName(), FirstGuardBlock); + GuardPredicates[Out] = Phi; + + // Optimization: Consider an incoming block A with both successors B1 and B2 + // in the set of outgoing blocks. The predicates for B1 and B2 complement + // each other. If B1 is visited first in the loop below, control will branch + // to B1 using the corresponding predicate. But if that branch is not taken, + // then control must reach B2, which means that the predicate for B2 is + // always true. This optimization is performed as follows: + // 1. At each iteration, drop the predicates for the current outgoing block. + // 2. If the current block has a non-constant predicate, then it's sibling + // has not been visited yet. Locate the sibling by matching the incoming + // block in the remaining predicates. Only one other outgoing block can + // have the same incoming block as a predecessor. + // 3. Unconditionally set the predicate of the sibling to "true". This also + // ensures that step 2 is not attempted when this sibling is visited. + // + // Besides being a minor optimization, this improves the readability of the + // IR and also allows us to later assert the claim that the last successor + // has a trivially true predicate. + auto Preds = OldPredicates[Out]; + OldPredicates.erase(Out); + for (auto In : Incoming) { + auto Predicate = Preds.count(In) ? Preds[In] : BoolFalse; + Phi->addIncoming(Predicate, In); + if (isa(Predicate)) { + continue; + } + bool Found = false; + for (auto &OtherPreds : OldPredicates) { + if (OtherPreds.second.count(In)) { + auto &X = OtherPreds.second[In]; + assert(!isa(X)); + X = BoolTrue; + Found = true; + break; + } + } + assert(Found); + } + } + +#ifndef NDEBUG + // Assert the original claim that the guard predicate for the last block is + // trivially true. + auto Last = Outgoing.back(); + assert(OldPredicates.count(Last)); + assert(OldPredicates.size() == 1); + auto &LastPreds = OldPredicates[Last]; + for (auto P : LastPreds) { + assert(P.second == BoolTrue); + } +#endif // NDEBUG +} + +static void createGuardBlocks(SmallVectorImpl &GuardBlocks, + Function *F, const BBSetVector &Outgoing, + BBPredicates &GuardPredicates, StringRef Prefix) { + // The last guard block can have two outgoing blocks as successors, since the + // condition for the final outgoing block is trivially true. So we create one + // less block (including the Hub) than the number of outgoing blocks. + for (int i = 0, e = Outgoing.size() - 2; i != e; ++i) { + GuardBlocks.push_back( + BasicBlock::Create(F->getContext(), Prefix + ".guard", F)); + } + assert(GuardBlocks.size() == GuardPredicates.size()); + + // To help keep the loop simple, temporarily append the last outgoing block to + // the list of guard blocks. + GuardBlocks.push_back(Outgoing.back()); + + for (int i = 0, e = GuardBlocks.size() - 1; i != e; ++i) { + auto Out = Outgoing[i]; + assert(GuardPredicates.count(Out)); + BranchInst::Create(Out, GuardBlocks[i + 1], GuardPredicates[Out], + GuardBlocks[i]); + } + + // Remove the last block from the guard list. + GuardBlocks.pop_back(); +} + +BasicBlock *llvm::CreateControlFlowHub( + DomTreeUpdater *DTU, SmallVectorImpl &GuardBlocks, + const BBSetVector &Incoming, const BBSetVector &Outgoing, + const StringRef Prefix) { + auto F = Incoming.front()->getParent(); + auto FirstGuardBlock = + BasicBlock::Create(F->getContext(), Prefix + ".guard", F); + + PredMap Predicates; + SmallVector Inverted; + collectPredicates(Predicates, Inverted, Incoming, Outgoing); + + SmallVector Updates; + if (DTU) { + for (auto In : Incoming) { + for (auto Succ : successors(In)) { + if (Outgoing.count(Succ)) + Updates.push_back({DominatorTree::Delete, In, Succ}); + } + Updates.push_back({DominatorTree::Insert, In, FirstGuardBlock}); + } + } + + redirectIncomingToHub(Incoming, Outgoing, FirstGuardBlock); + + BBPredicates GuardPredicates; + createGuardPredicates(GuardPredicates, Predicates, Incoming, Outgoing, + FirstGuardBlock); + + GuardBlocks.push_back(FirstGuardBlock); + createGuardBlocks(GuardBlocks, F, Outgoing, GuardPredicates, Prefix); + + // Update the PHINodes in each outgoing block to match the new control flow. + for (int i = 0, e = GuardBlocks.size(); i != e; ++i) { + reconnectPhis(Outgoing[i], GuardBlocks[i], Incoming, FirstGuardBlock); + } + reconnectPhis(Outgoing.back(), GuardBlocks.back(), Incoming, FirstGuardBlock); + + if (DTU) { + for (int i = 0, e = GuardBlocks.size() - 1; i != e; ++i) { + Updates.push_back({DominatorTree::Insert, GuardBlocks[i], Outgoing[i]}); + Updates.push_back( + {DominatorTree::Insert, GuardBlocks[i], GuardBlocks[i + 1]}); + } + Updates.push_back( + {DominatorTree::Insert, GuardBlocks.back(), Outgoing.back()}); + DTU->applyUpdates(Updates); + } + + for (auto I : Inverted) { + if (I->use_empty()) + if (auto Inst = dyn_cast_or_null(I)) + Inst->eraseFromParent(); + } + + return FirstGuardBlock; +} diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -63,6 +63,7 @@ StripNonLineTableDebugInfo.cpp SymbolRewriter.cpp UnifyFunctionExitNodes.cpp + UnifyLoopExits.cpp Utils.cpp ValueMapper.cpp VNCoercion.cpp diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3011,3 +3011,34 @@ AllocaForValue[V] = Res; return Res; } + +Value *llvm::invertCondition(Value *Condition) { + // First: Check if it's a constant + if (Constant *C = dyn_cast(Condition)) + return ConstantExpr::getNot(C); + + // Second: If the condition is already inverted, return the original value + Value *NotCondition; + if (match(Condition, m_Not(m_Value(NotCondition)))) + return NotCondition; + + if (Instruction *Inst = dyn_cast(Condition)) { + // Third: Check all the users for an invert + BasicBlock *Parent = Inst->getParent(); + for (User *U : Condition->users()) + if (Instruction *I = dyn_cast(U)) + if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition)))) + return I; + + // Last option: Create a new instruction + return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); + } + + if (Argument *Arg = dyn_cast(Condition)) { + BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock(); + return BinaryOperator::CreateNot(Condition, Arg->getName() + ".inv", + EntryBlock.getTerminator()); + } + + llvm_unreachable("Unhandled condition to invert"); +} diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -0,0 +1,208 @@ +//===- UnifyLoopExits.cpp - Redirect exiting edges to one block -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// For each natural loop with multiple exit blocks, this pass creates a new +// block N such that all exiting blocks now branch to N, and then control flow +// is redistributed to all the original exit blocks. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "unify-loop-exits" + +using namespace llvm; + +namespace { +struct UnifyLoopExits : public FunctionPass { + static char ID; + UnifyLoopExits() : FunctionPass(ID) { + initializeUnifyLoopExitsPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + } + + bool runOnFunction(Function &F); +}; +} // namespace + +char UnifyLoopExits::ID = 0; + +FunctionPass *llvm::createUnifyLoopExitsPass() { return new UnifyLoopExits(); } + +INITIALIZE_PASS_BEGIN(UnifyLoopExits, "unify-loop-exits", + "Fixup each natural loop to have a single exit block", + false /* Only looks at CFG */, false /* Analysis Pass */) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(UnifyLoopExits, "unify-loop-exits", + "Fixup each natural loop to have a single exit block", + false /* Only looks at CFG */, false /* Analysis Pass */) + +// The current transform introduces new control flow paths which may break the +// SSA requirement that every def must dominate all its uses. For example, +// consider a value D defined inside the loop that is used by some instruction +// U outside the loop. It follows that D dominates U, since the original +// program has valid SSA form. After merging the exits, all paths from D to U +// now flow through the unified exit block. In addition, there may be other +// paths that do not pass through D, but now reach the unified exit +// block. Thus, D no longer dominates U. +// +// Restore the dominance by creating a phi for each such D at the new unified +// loop exit. But when doing this, ignore any uses U that are in the new unified +// loop exit, since those were introduced specially when the block was created. +// +// The use of SSAUpdater seems like overkill for this operation. The location +// for creating the new PHI is well-known, and also the set of incoming blocks +// to the new PHI. +static void restoreSSA(const DominatorTree &DT, const Loop *L, + const SetVector &Incoming, + BasicBlock *LoopExitBlock) { + using InstVector = SmallVector; + using IIMap = DenseMap; + IIMap ExternalUsers; + for (auto BB : L->blocks()) { + for (auto &I : *BB) { + for (auto &U : I.uses()) { + auto UserInst = cast(U.getUser()); + auto UserBlock = UserInst->getParent(); + if (UserBlock == LoopExitBlock) + continue; + if (L->contains(UserBlock)) + continue; + LLVM_DEBUG(dbgs() + << "added ext use for " << I.getName() << "(" + << BB->getName() << ")" + << ": " << UserInst->getName() << "(" << UserBlock->getName() << ")" + << "\n"); + ExternalUsers[&I].push_back(UserInst); + } + } + } + + for (auto II : ExternalUsers) { + // For each Def used outside the loop, create NewPhi in + // LoopExitBlock. NewPhi receives Def only along exiting blocks that + // dominate it, while the remaining values are undefined since those paths + // didn't exist in the original CFG. + auto Def = II.first; + LLVM_DEBUG(dbgs() << "externally used: " << Def->getName() << "\n"); + auto NewPhi = PHINode::Create(Def->getType(), Incoming.size(), + Def->getName() + ".moved", + LoopExitBlock->getTerminator()); + for (auto In : Incoming) { + LLVM_DEBUG(dbgs() << "predecessor " << In->getName() << ": "); + if (Def->getParent() == In || DT.dominates(Def, In)) { + LLVM_DEBUG(dbgs() << "dominated\n"); + NewPhi->addIncoming(Def, In); + } else { + LLVM_DEBUG(dbgs() << "not dominated\n"); + NewPhi->addIncoming(UndefValue::get(Def->getType()), In); + } + } + + LLVM_DEBUG(dbgs() << "external users:"); + for (auto U : II.second) { + LLVM_DEBUG(dbgs() << " " << U->getName()); + U->replaceUsesOfWith(Def, NewPhi); + } + LLVM_DEBUG(dbgs() << "\n"); + } +} + +static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { + // We need SetVectors, but the Loop API takes a vector, so we use a temporary. + SmallVector Temp; + L->getExitingBlocks(Temp); + + // Locate exits by examining the successors of the exiting blocks. This is + // cheaper than calling Loop::getExitBlocks(), since that goes through the + // entire list of blocks in the loop. + SetVector ExitingBlocks; + SetVector Exits; + for (auto BB : Temp) { + ExitingBlocks.insert(BB); + for (auto S : successors(BB)) { + auto SL = LI.getLoopFor(S); + // A successor is not an exit if it is directly or indirectly in the + // current loop. + if (SL == L || L->contains(SL)) + continue; + Exits.insert(S); + } + } + + LLVM_DEBUG( + dbgs() << "Found exit blocks:"; + for (auto Exit : Exits) { + dbgs() << " " << Exit->getName() << ""; + } + dbgs() << "\n"; + + dbgs() << "Found exiting blocks:"; + for (auto EB : ExitingBlocks) { + dbgs() << " " << EB->getName(); + } + dbgs() << "\n";); + + if (Exits.size() <= 1) { + LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n"); + return false; + } + + SmallVector GuardBlocks; + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + auto LoopExitBlock = CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks, + Exits, "loop.exit"); + + restoreSSA(DT, L, ExitingBlocks, LoopExitBlock); + +#if defined(EXPENSIVE_CHECKS) + assert(DT.verify(DominatorTree::VerificationLevel::Full)); +#else + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); +#endif + L->verifyLoop(); + + // The guard blocks were created outside the loop, so they need to become + // members of the parent loop. + if (auto ParentLoop = L->getParentLoop()) { + for (auto G : GuardBlocks) { + ParentLoop->addBasicBlockToLoop(G, LI); + } + ParentLoop->verifyLoop(); + } + LI.verify(DT); + + return true; +} + +bool UnifyLoopExits::runOnFunction(Function &F) { + LLVM_DEBUG(dbgs() << "===== Unifying loop exits in function " << F.getName() + << "\n"); + auto &LI = getAnalysis().getLoopInfo(); + auto &DT = getAnalysis().getDomTree(); + + bool Changed = false; + auto Loops = LI.getLoopsInPreorder(); + for (auto L : Loops) { + LLVM_DEBUG(dbgs() << "Loop: " << L->getHeader()->getName() << " (depth: " + << LI.getLoopDepth(L->getHeader()) << ")\n"); + Changed |= unifyLoopExits(DT, LI, L); + } + return Changed; +} diff --git a/llvm/lib/Transforms/Utils/Utils.cpp b/llvm/lib/Transforms/Utils/Utils.cpp --- a/llvm/lib/Transforms/Utils/Utils.cpp +++ b/llvm/lib/Transforms/Utils/Utils.cpp @@ -40,6 +40,7 @@ initializeStripGCRelocatesPass(Registry); initializePredicateInfoPrinterLegacyPassPass(Registry); initializeInjectTLIMappingsLegacyPass(Registry); + initializeUnifyLoopExitsPass(Registry); } /// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses. diff --git a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll @@ -0,0 +1,173 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -unify-loop-exits -structurizecfg -S | FileCheck %s + +; The structurizer uses an RPO traversal over a region, along with a +; manual hack that is meant to sort ensure that blocks within a loop +; are all visited before visiting blocks outside the loop. But this +; does not always work as expected. For example the results are +; incorrect when multiple nested loops are involved. + +; The workaround for this is to unify loop exits. Each loop now +; becomes an SESE region with a single header and a single exit. The +; structurizer is a region pass, and it no longer sees the entire loop +; nest in a single region. More importantly, for each loop, the only +; block reachable outside the loop is the region exit, which avoids +; any confusion in the hacked RPO traversal. + +; In the function below, B1 is an exiting block in outer loop H1. It's +; successor inside the loop is the header of another loop H2. Due to +; the incorrect traversal, B1 dominates all the blocks in the +; structurized program, except the header H1. + +define void @exiting-block(i1 %PredH1, i1 %PredB2, i1 %PredB1, i1 %PredH2) { +; CHECK-LABEL: @exiting-block( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PREDB2_INV:%.*]] = xor i1 [[PREDB2:%.*]], true +; CHECK-NEXT: [[PREDH1_INV:%.*]] = xor i1 [[PREDH1:%.*]], true +; CHECK-NEXT: br label [[H1:%.*]] +; CHECK: H1: +; CHECK-NEXT: br i1 [[PREDH1_INV]], label [[B1:%.*]], label [[FLOW3:%.*]] +; CHECK: Flow3: +; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ [[PREDB1:%.*]], [[B1]] ], [ [[PREDH1]], [[H1]] ] +; CHECK-NEXT: br i1 [[TMP0]], label [[H2:%.*]], label [[FLOW4:%.*]] +; CHECK: H2: +; CHECK-NEXT: br i1 [[PREDH2:%.*]], label [[B2:%.*]], label [[FLOW:%.*]] +; CHECK: B2: +; CHECK-NEXT: br i1 [[PREDB2_INV]], label [[L2:%.*]], label [[FLOW2:%.*]] +; CHECK: Flow: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, [[FLOW2]] ], [ true, [[H2]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ [[TMP4:%.*]], [[FLOW2]] ], [ true, [[H2]] ] +; CHECK-NEXT: br i1 [[TMP2]], label [[LOOP_EXIT_GUARD1:%.*]], label [[H2]] +; CHECK: L2: +; CHECK-NEXT: br label [[FLOW2]] +; CHECK: L1: +; CHECK-NEXT: br label [[FLOW5:%.*]] +; CHECK: B1: +; CHECK-NEXT: br label [[FLOW3]] +; CHECK: C: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: Flow5: +; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, [[L1:%.*]] ], [ true, [[LOOP_EXIT_GUARD1]] ] +; CHECK-NEXT: br label [[FLOW4]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: br i1 [[TMP5:%.*]], label [[C:%.*]], label [[EXIT]] +; CHECK: Flow2: +; CHECK-NEXT: [[TMP4]] = phi i1 [ false, [[L2]] ], [ true, [[B2]] ] +; CHECK-NEXT: br label [[FLOW]] +; CHECK: Flow4: +; CHECK-NEXT: [[TMP5]] = phi i1 [ false, [[FLOW5]] ], [ true, [[FLOW3]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP3]], [[FLOW5]] ], [ true, [[FLOW3]] ] +; CHECK-NEXT: br i1 [[TMP6]], label [[LOOP_EXIT_GUARD:%.*]], label [[H1]] +; CHECK: loop.exit.guard1: +; CHECK-NEXT: br i1 [[TMP1]], label [[L1]], label [[FLOW5]] +; +entry: + br label %H1 + +H1: ; preds = %L1, %entry + br i1 %PredH1, label %H2, label %B1 + +H2: ; preds = %B1, %L2, %H1 + br i1 %PredH2, label %B2, label %L1 + +B2: ; preds = %H2 + br i1 %PredB2, label %exit, label %L2 + +L2: ; preds = %B2 + br label %H2 + +L1: ; preds = %H2 + br label %H1 + +B1: ; preds = %H1 + br i1 %PredB1, label %H2, label %C + +C: ; preds = %B1 + br label %exit + +exit: ; preds = %C, %B2 + ret void +} + +; The function below has three nested loops. Due to the incorrect +; traversal, H2 dominates H3 in the structurized program, and the +; backedge from L13 to H3 has no equivalent path. + +define void @incorrect-backedge(i1 %PredH2, i1 %PredH3, i1 %PredL2, i1 %PredL13, i1 %PredL1) +; CHECK-LABEL: @incorrect-backedge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PREDL13_INV:%.*]] = xor i1 [[PREDL13:%.*]], true +; CHECK-NEXT: [[PREDH3_INV:%.*]] = xor i1 [[PREDH3:%.*]], true +; CHECK-NEXT: [[PREDL2_INV:%.*]] = xor i1 [[PREDL2:%.*]], true +; CHECK-NEXT: [[PREDH2_INV:%.*]] = xor i1 [[PREDH2:%.*]], true +; CHECK-NEXT: br label [[H1:%.*]] +; CHECK: H1: +; CHECK-NEXT: br label [[H2:%.*]] +; CHECK: H2: +; CHECK-NEXT: br i1 [[PREDH2_INV]], label [[H3:%.*]], label [[FLOW4:%.*]] +; CHECK: H3: +; CHECK-NEXT: br i1 [[PREDH3_INV]], label [[L2:%.*]], label [[FLOW:%.*]] +; CHECK: L2: +; CHECK-NEXT: br i1 [[PREDL2_INV]], label [[L13:%.*]], label [[FLOW3:%.*]] +; CHECK: Flow: +; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ false, [[FLOW3]] ], [ true, [[H3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP6:%.*]], [[FLOW3]] ], [ true, [[H3]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW3]] ], [ true, [[H3]] ] +; CHECK-NEXT: br i1 [[TMP2]], label [[LOOP_EXIT_GUARD2:%.*]], label [[H3]] +; CHECK: L13: +; CHECK-NEXT: br label [[FLOW3]] +; CHECK: Flow5: +; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[TMP8:%.*]], [[LOOP_EXIT_GUARD1:%.*]] ], [ true, [[LOOP_EXIT_GUARD:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ false, [[LOOP_EXIT_GUARD1]] ], [ true, [[LOOP_EXIT_GUARD]] ] +; CHECK-NEXT: br i1 [[TMP4]], label [[L1:%.*]], label [[FLOW6:%.*]] +; CHECK: L1: +; CHECK-NEXT: br label [[FLOW6]] +; CHECK: Flow6: +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[PREDL1:%.*]], [[L1]] ], [ [[TMP3]], [[FLOW5:%.*]] ] +; CHECK-NEXT: br i1 [[TMP5]], label [[EXIT:%.*]], label [[H1]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.exit.guard: +; CHECK-NEXT: br i1 [[TMP11:%.*]], label [[LOOP_EXIT_GUARD1]], label [[FLOW5]] +; CHECK: loop.exit.guard1: +; CHECK-NEXT: br label [[FLOW5]] +; CHECK: Flow3: +; CHECK-NEXT: [[TMP6]] = phi i1 [ true, [[L13]] ], [ false, [[L2]] ] +; CHECK-NEXT: [[TMP7]] = phi i1 [ [[PREDL13_INV]], [[L13]] ], [ true, [[L2]] ] +; CHECK-NEXT: br label [[FLOW]] +; CHECK: Flow4: +; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP0]], [[LOOP_EXIT_GUARD2]] ], [ false, [[H2]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[LOOP_EXIT_GUARD2]] ], [ true, [[H2]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ [[TMP1]], [[LOOP_EXIT_GUARD2]] ], [ true, [[H2]] ] +; CHECK-NEXT: [[TMP11]] = xor i1 [[TMP9]], true +; CHECK-NEXT: br i1 [[TMP10]], label [[LOOP_EXIT_GUARD]], label [[H2]] +; CHECK: loop.exit.guard2: +; CHECK-NEXT: br label [[FLOW4]] +; +{ +entry: + br label %H1 + +H1: + br label %H2 + +H2: + br i1 %PredH2, label %L1, label %H3 + +H3: + br i1 %PredH3, label %exit, label %L2 + +L2: + br i1 %PredL2, label %H2, label %L13 + +L13: + br i1 %PredL13, label %H3, label %H1 + +L1: + br i1 %PredL1, label %exit, label %H1 + +exit: + ret void +} diff --git a/llvm/test/Transforms/UnifyLoopExits/basic.ll b/llvm/test/Transforms/UnifyLoopExits/basic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/UnifyLoopExits/basic.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -unify-loop-exits -S | FileCheck %s + +define void @loop_1(i1 %PredEntry, i1 %PredB, i1 %PredC, i1 %PredD) { +; CHECK-LABEL: @loop_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[G:%.*]] +; CHECK: A: +; CHECK-NEXT: br label [[B:%.*]] +; CHECK: B: +; CHECK-NEXT: br i1 [[PREDB:%.*]], label [[C:%.*]], label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: C: +; CHECK-NEXT: br i1 [[PREDC:%.*]], label [[D:%.*]], label [[LOOP_EXIT_GUARD]] +; CHECK: D: +; CHECK-NEXT: br i1 [[PREDD:%.*]], label [[A]], label [[LOOP_EXIT_GUARD]] +; CHECK: E: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: F: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: G: +; CHECK-NEXT: br label [[F:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_E:%.*]] = phi i1 [ true, [[B]] ], [ false, [[C]] ], [ false, [[D]] ] +; CHECK-NEXT: br i1 [[GUARD_E]], label [[E:%.*]], label [[F]] +; +entry: + br i1 %PredEntry, label %A, label %G + +A: ; preds = %D, %entry + br label %B + +B: ; preds = %A + br i1 %PredB, label %C, label %E + +C: ; preds = %B + br i1 %PredC, label %D, label %F + +D: ; preds = %C + br i1 %PredD, label %A, label %F + +E: ; preds = %B + br label %exit + +F: ; preds = %G, %D, %C + br label %exit + +G: ; preds = %entry + br label %F + +exit: ; preds = %F, %E + ret void +} + +define void @loop_2(i1 %PredA, i1 %PredB, i1 %PredC) { +; CHECK-LABEL: @loop_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: A: +; CHECK-NEXT: br i1 [[PREDA:%.*]], label [[B:%.*]], label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: B: +; CHECK-NEXT: br i1 [[PREDB:%.*]], label [[C:%.*]], label [[LOOP_EXIT_GUARD]] +; CHECK: C: +; CHECK-NEXT: br i1 [[PREDC:%.*]], label [[D:%.*]], label [[LOOP_EXIT_GUARD]] +; CHECK: D: +; CHECK-NEXT: br label [[A]] +; CHECK: X: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: Y: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: Z: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_X:%.*]] = phi i1 [ true, [[A]] ], [ false, [[B]] ], [ false, [[C]] ] +; CHECK-NEXT: [[GUARD_Y:%.*]] = phi i1 [ false, [[A]] ], [ true, [[B]] ], [ false, [[C]] ] +; CHECK-NEXT: br i1 [[GUARD_X]], label [[X:%.*]], label [[LOOP_EXIT_GUARD1:%.*]] +; CHECK: loop.exit.guard1: +; CHECK-NEXT: br i1 [[GUARD_Y]], label [[Y:%.*]], label [[Z:%.*]] +; +entry: + br label %A + +A: ; preds = %D, %entry + br i1 %PredA, label %B, label %X + +B: ; preds = %A + br i1 %PredB, label %C, label %Y + +C: ; preds = %B + br i1 %PredC, label %D, label %Z + +D: ; preds = %C + br label %A + +X: ; preds = %B + br label %exit + +Y: ; preds = %G, %D, %C + br label %exit + +Z: ; preds = %entry + br label %exit + +exit: ; preds = %F, %E + ret void +} diff --git a/llvm/test/Transforms/UnifyLoopExits/nested.ll b/llvm/test/Transforms/UnifyLoopExits/nested.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/UnifyLoopExits/nested.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -unify-loop-exits -S | FileCheck %s + +define void @nested(i1 %PredB3, i1 %PredB4, i1 %PredA4, i1 %PredA3, i32 %X, i32 %Y, i32 %Z) { +; CHECK-LABEL: @nested( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[A1:%.*]] +; CHECK: A1: +; CHECK-NEXT: br label [[B1:%.*]] +; CHECK: B1: +; CHECK-NEXT: br label [[B2:%.*]] +; CHECK: B2: +; CHECK-NEXT: [[X_INC:%.*]] = add i32 [[X:%.*]], 1 +; CHECK-NEXT: br label [[B3:%.*]] +; CHECK: B3: +; CHECK-NEXT: br i1 [[PREDB3:%.*]], label [[B4:%.*]], label [[LOOP_EXIT_GUARD1:%.*]] +; CHECK: B4: +; CHECK-NEXT: br i1 [[PREDB4:%.*]], label [[B1]], label [[LOOP_EXIT_GUARD1]] +; CHECK: A2: +; CHECK-NEXT: br label [[A4:%.*]] +; CHECK: A3: +; CHECK-NEXT: br label [[A4]] +; CHECK: A4: +; CHECK-NEXT: [[A4_PHI:%.*]] = phi i32 [ [[Y:%.*]], [[A3:%.*]] ], [ [[X_INC_MOVED:%.*]], [[A2:%.*]] ] +; CHECK-NEXT: br i1 [[PREDA4:%.*]], label [[LOOP_EXIT_GUARD:%.*]], label [[A5:%.*]] +; CHECK: A5: +; CHECK-NEXT: br i1 [[PREDA3:%.*]], label [[LOOP_EXIT_GUARD]], label [[A1]] +; CHECK: C: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[EXIT_PHI:%.*]] = phi i32 [ [[Z:%.*]], [[C:%.*]] ], [ [[EXIT_PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD]] ] +; CHECK-NEXT: ret void +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_C:%.*]] = phi i1 [ true, [[A4]] ], [ false, [[A5]] ] +; CHECK-NEXT: [[EXIT_PHI_MOVED]] = phi i32 [ undef, [[A4]] ], [ [[A4_PHI]], [[A5]] ] +; CHECK-NEXT: br i1 [[GUARD_C]], label [[C]], label [[EXIT]] +; CHECK: loop.exit.guard1: +; CHECK-NEXT: [[GUARD_A3:%.*]] = phi i1 [ true, [[B3]] ], [ false, [[B4]] ] +; CHECK-NEXT: [[X_INC_MOVED]] = phi i32 [ [[X_INC]], [[B3]] ], [ [[X_INC]], [[B4]] ] +; CHECK-NEXT: br i1 [[GUARD_A3]], label [[A3]], label [[A2]] +; +entry: + br label %A1 + +A1: + br label %B1 + +B1: ; preds = %A + br label %B2 + +B2: + %X.inc = add i32 %X, 1 + br label %B3 + +B3: ; preds = %C + br i1 %PredB3, label %B4, label %A3 + +B4: + br i1 %PredB4, label %B1, label %A2 + +A2: + br label %A4 + +A3: + br label %A4 + +A4: + %A4.phi = phi i32 [%Y, %A3], [%X.inc, %A2] + br i1 %PredA4, label %C, label %A5 + +A5: ; preds = %entry + br i1 %PredA3, label %exit, label %A1 + +C: + br label %exit + +exit: + %exit.phi = phi i32 [%A4.phi, %A5], [%Z, %C] + ret void +} diff --git a/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll b/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -unify-loop-exits -S | FileCheck %s + +; Loop consists of A and B: +; - A is the header +; - A and B are exiting blocks +; - C and return are exit blocks. +; Pattern: Value (%mytmp42) defined in exiting block (A) and used in +; exit block (return). +; The relevant code uses DT::dominates(Value, +; BasicBlock). This is misnamed because it actually checks +; strict dominance, causing the pattern to be miscompiled +; (the use receives an undef value). +define i32 @exiting-used-in-exit(i32* %arg1, i32* %arg2) local_unnamed_addr align 2 { +; CHECK-LABEL: @exiting-used-in-exit( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: A: +; CHECK-NEXT: [[MYTMP42:%.*]] = load i32, i32* [[ARG1:%.*]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[B:%.*]], label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: B: +; CHECK-NEXT: [[MYTMP41:%.*]] = load i32, i32* [[ARG2:%.*]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[MYTMP41]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[A]], label [[LOOP_EXIT_GUARD]] +; CHECK: C: +; CHECK-NEXT: [[INC:%.*]] = add i32 [[MYTMP41_MOVED:%.*]], 1 +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: return: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[INC]], [[C:%.*]] ], [ [[PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A]] ], [ false, [[B]] ] +; CHECK-NEXT: [[PHI_MOVED]] = phi i32 [ [[MYTMP42]], [[A]] ], [ undef, [[B]] ] +; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ undef, [[A]] ], [ [[MYTMP41]], [[B]] ] +; CHECK-NEXT: br i1 [[GUARD_RETURN]], label [[RETURN]], label [[C]] +; +entry: + br label %A + +A: ; preds = %B, %A.lr.ph + %mytmp42 = load i32, i32* %arg1, align 4 + %cmp1 = icmp slt i32 %mytmp42, 0 + br i1 %cmp1, label %B, label %return + +B: ; preds = %A + %mytmp41 = load i32, i32* %arg2, align 4 + %cmp = icmp slt i32 %mytmp41, 0 + br i1 %cmp, label %A, label %C + +C: + %inc = add i32 %mytmp41, 1 + br label %return + +return: ; preds = %B38, %A + %phi = phi i32 [ %inc, %C ], [ %mytmp42, %A ] + ret i32 %phi +} + +; Loop consists of A, B and C: +; - A is the header +; - A and C are exiting blocks +; - B is an "internal" block that dominates exiting block C +; - D and return are exit blocks. +; Pattern: Value (%mytmp41) defined in internal block (B) and used in an +; exit block (D). +define i32 @internal-used-in-exit(i32* %arg1, i32* %arg2) local_unnamed_addr align 2 { +; CHECK-LABEL: @internal-used-in-exit( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MYTMP42:%.*]] = load i32, i32* [[ARG1:%.*]], align 4 +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: A: +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[B:%.*]], label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: B: +; CHECK-NEXT: [[MYTMP41:%.*]] = load i32, i32* [[ARG2:%.*]], align 4 +; CHECK-NEXT: br label [[C:%.*]] +; CHECK: C: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[A]], label [[LOOP_EXIT_GUARD]] +; CHECK: D: +; CHECK-NEXT: [[INC:%.*]] = add i32 [[MYTMP41_MOVED:%.*]], 1 +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: return: +; CHECK-NEXT: ret i32 0 +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A]] ], [ false, [[C]] ] +; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ undef, [[A]] ], [ [[MYTMP41]], [[C]] ] +; CHECK-NEXT: br i1 [[GUARD_RETURN]], label [[RETURN]], label [[D:%.*]] +; +entry: + %mytmp42 = load i32, i32* %arg1, align 4 + br label %A + +A: ; preds = %B, %A.lr.ph + %cmp1 = icmp slt i32 %mytmp42, 0 + br i1 %cmp1, label %B, label %return + +B: ; preds = %A + %mytmp41 = load i32, i32* %arg2, align 4 + br label %C + +C: + %cmp = icmp slt i32 %mytmp42, 0 + br i1 %cmp, label %A, label %D + +D: + %inc = add i32 %mytmp41, 1 + br label %return + +return: ; preds = %B38, %A + ret i32 0 +} + +; Loop consists of A, B and C: +; - A is the header +; - A and C are exiting blocks +; - B is an "internal" block that dominates exiting block C +; - D and return are exit blocks. +; Pattern: %return contains a phi node that receives values from +; %entry, %A and %D. This mixes all the special cases in a single phi. +define i32 @mixed-use-in-exit(i32* %arg1, i32* %arg2) local_unnamed_addr align 2 { +; CHECK-LABEL: @mixed-use-in-exit( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MYTMP42:%.*]] = load i32, i32* [[ARG1:%.*]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: br i1 [[CMP2]], label [[A:%.*]], label [[RETURN:%.*]] +; CHECK: A: +; CHECK-NEXT: [[MYTMP43:%.*]] = add i32 [[MYTMP42]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[B:%.*]], label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: B: +; CHECK-NEXT: [[MYTMP41:%.*]] = load i32, i32* [[ARG2:%.*]], align 4 +; CHECK-NEXT: br label [[C:%.*]] +; CHECK: C: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[A]], label [[LOOP_EXIT_GUARD]] +; CHECK: D: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[MYTMP41_MOVED:%.*]], [[D:%.*]] ], [ [[MYTMP42]], [[ENTRY:%.*]] ], [ [[PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A]] ], [ false, [[C]] ] +; CHECK-NEXT: [[PHI_MOVED]] = phi i32 [ [[MYTMP43]], [[A]] ], [ undef, [[C]] ] +; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ undef, [[A]] ], [ [[MYTMP41]], [[C]] ] +; CHECK-NEXT: br i1 [[GUARD_RETURN]], label [[RETURN]], label [[D]] +; +entry: + %mytmp42 = load i32, i32* %arg1, align 4 + %cmp2 = icmp slt i32 %mytmp42, 0 + br i1 %cmp2, label %A, label %return + +A: ; preds = %B, %A.lr.ph + %mytmp43 = add i32 %mytmp42, 1 + %cmp1 = icmp slt i32 %mytmp42, 0 + br i1 %cmp1, label %B, label %return + +B: ; preds = %A + %mytmp41 = load i32, i32* %arg2, align 4 + br label %C + +C: + %cmp = icmp slt i32 %mytmp42, 0 + br i1 %cmp, label %A, label %D + +D: + br label %return + +return: ; preds = %B38, %A + %phi = phi i32 [ %mytmp41, %D ], [ %mytmp43, %A ], [%mytmp42, %entry] + ret i32 %phi +} + +; Loop consists of A, B and C: +; - A is the header +; - A and C are exiting blocks +; - B is an "internal" block that dominates exiting block C +; - D and E are exit blocks. +; Pattern: Value (%mytmp41) defined in internal block (B) and used in a +; downstream block not related to the loop (return). The use +; is a phi where the incoming block for %mytmp41 is not related +; to the loop (D). +; This pattern does not involve either the exiting blocks or +; the exit blocks, which catches any such assumptions built +; into the SSA reconstruction phase. +define i32 @phi-via-external-block(i32* %arg1, i32* %arg2) local_unnamed_addr align 2 { +; CHECK-LABEL: @phi-via-external-block( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MYTMP42:%.*]] = load i32, i32* [[ARG1:%.*]], align 4 +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: A: +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[B:%.*]], label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: B: +; CHECK-NEXT: [[MYTMP41:%.*]] = load i32, i32* [[ARG2:%.*]], align 4 +; CHECK-NEXT: br label [[C:%.*]] +; CHECK: C: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[A]], label [[LOOP_EXIT_GUARD]] +; CHECK: D: +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: E: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[MYTMP41_MOVED:%.*]], [[D:%.*]] ], [ [[MYTMP42]], [[E:%.*]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_E:%.*]] = phi i1 [ true, [[A]] ], [ false, [[C]] ] +; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ undef, [[A]] ], [ [[MYTMP41]], [[C]] ] +; CHECK-NEXT: br i1 [[GUARD_E]], label [[E]], label [[D]] +; +entry: + %mytmp42 = load i32, i32* %arg1, align 4 + br label %A + +A: ; preds = %B, %A.lr.ph + %cmp1 = icmp slt i32 %mytmp42, 0 + br i1 %cmp1, label %B, label %E + +B: ; preds = %A + %mytmp41 = load i32, i32* %arg2, align 4 + br label %C + +C: + %cmp = icmp slt i32 %mytmp42, 0 + br i1 %cmp, label %A, label %D + +D: + br label %return + +E: + br label %return + +return: ; preds = %B38, %A + %phi = phi i32 [ %mytmp41, %D ], [ %mytmp42, %E ] + ret i32 %phi +} diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -316,6 +316,7 @@ initializeScalarizeMaskedMemIntrinPass(*Registry); initializeExpandReductionsPass(*Registry); initializeHardwareLoopsPass(*Registry); + initializeTransformUtils(*Registry); // Initialize debugging passes. initializeScavengerTestPass(*Registry);