diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -124,6 +124,7 @@ void initializeDAEPass(PassRegistry&); void initializeDAHPass(PassRegistry&); void initializeDCELegacyPassPass(PassRegistry&); +void initializeDFAJumpThreadingLegacyPassPass(PassRegistry &); void initializeDSELegacyPassPass(PassRegistry&); void initializeDataFlowSanitizerLegacyPassPass(PassRegistry &); void initializeDeadMachineInstructionElimPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -174,6 +174,7 @@ (void) llvm::createStripDeadPrototypesPass(); (void) llvm::createTailCallEliminationPass(); (void) llvm::createJumpThreadingPass(); + (void) llvm::createDFAJumpThreadingPass(); (void) llvm::createUnifyFunctionExitNodesPass(); (void) llvm::createInstCountPass(); (void) llvm::createConstantHoistingPass(); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -253,6 +253,14 @@ FunctionPass *createJumpThreadingPass(bool FreezeSelectCond = false, int Threshold = -1); +//===----------------------------------------------------------------------===// +// +// DFAJumpThreading - When a switch statement inside a loop is used to +// implement a deterministic finite automata we can jump thread the switch +// statement reducing number of conditional jumps. +// +FunctionPass *createDFAJumpThreadingPass(); + //===----------------------------------------------------------------------===// // // CFGSimplification - Merge basic blocks, eliminate unreachable blocks, diff --git a/llvm/include/llvm/Transforms/Scalar/DFAJumpThreading.h b/llvm/include/llvm/Transforms/Scalar/DFAJumpThreading.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/DFAJumpThreading.h @@ -0,0 +1,27 @@ +//===- DFAJumpThreading.h - Threads a switch statement inside a loop ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides the interface for the DFAJumpThreading pass. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_DFAJUMPTHREADING_H +#define LLVM_TRANSFORMS_SCALAR_DFAJUMPTHREADING_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct DFAJumpThreadingPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_DFAJUMPTHREADING_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -145,6 +145,7 @@ #include "llvm/Transforms/Scalar/ConstraintElimination.h" #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" #include "llvm/Transforms/Scalar/DCE.h" +#include "llvm/Transforms/Scalar/DFAJumpThreading.h" #include "llvm/Transforms/Scalar/DeadStoreElimination.h" #include "llvm/Transforms/Scalar/DivRemPairs.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" @@ -302,6 +303,7 @@ extern cl::opt EnableLoopInterchange; extern cl::opt EnableUnrollAndJam; extern cl::opt EnableLoopFlatten; +extern cl::opt EnableDFAJumpThreading; extern cl::opt RunNewGVN; extern cl::opt RunPartialInlining; extern cl::opt ExtraVectorizerPasses; @@ -829,6 +831,9 @@ // Re-consider control flow based optimizations after redundancy elimination, // redo DCE, etc. + if (EnableDFAJumpThreading && Level.getSizeLevel() == 0) + FPM.addPass(DFAJumpThreadingPass()); + FPM.addPass(JumpThreadingPass()); FPM.addPass(CorrelatedValuePropagationPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -212,6 +212,7 @@ FUNCTION_PASS("coro-cleanup", CoroCleanupPass()) FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass()) FUNCTION_PASS("dce", DCEPass()) +FUNCTION_PASS("dfa-jump-threading", DFAJumpThreadingPass()) FUNCTION_PASS("div-rem-pairs", DivRemPairsPass()) FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dot-cfg", CFGPrinterPass()) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -99,6 +99,10 @@ cl::Hidden, cl::desc("Enable the LoopFlatten Pass")); +cl::opt EnableDFAJumpThreading("enable-dfa-jump-thread", + cl::desc("Enable DFA jump threading."), + cl::init(false), cl::Hidden); + static cl::opt EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -500,6 +504,9 @@ MPM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, MPM); if (OptLevel > 1) { + if (EnableDFAJumpThreading && SizeLevel == 0) + MPM.add(createDFAJumpThreadingPass()); + MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass()); } diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -9,6 +9,7 @@ CorrelatedValuePropagation.cpp DCE.cpp DeadStoreElimination.cpp + DFAJumpThreading.cpp DivRemPairs.cpp EarlyCSE.cpp FlattenCFGPass.cpp diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -0,0 +1,1287 @@ +//===- DFAJumpThreading.cpp - Threads a switch statement inside a loop ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Transform each threading path to effectively jump thread the DFA. For +// example, the CFG below could be transformed as follows, where the cloned +// blocks unconditionally branch to the next correct case based on what is +// identified in the analysis. +// +// sw.bb sw.bb +// / | \ / | \ +// case1 case2 case3 case1 case2 case3 +// \ | / | | | +// determinator det.2 det.3 det.1 +// br sw.bb / | \ +// sw.bb.2 sw.bb.3 sw.bb.1 +// br case2 br case3 br case1ยง +// +// Definitions and Terminology: +// +// * Threading path: +// a list of basic blocks, the exit state, and the block that determines +// the next state, for which the following notation will be used: +// < path of BBs that form a cycle > [ state, determinator ] +// +// * Predictable switch: +// The switch variable is always a known constant so that all conditional +// jumps based on switch variable can be converted to unconditional jump. +// +// * Determinator: +// The basic block that determines the next state of the DFA. +// +// Representing the optimization in C-like pseudocode: the code pattern on the +// left could functionally be transformed to the right pattern if the switch +// condition is predictable. +// +// X = A goto A +// for (...) A: +// switch (X) ... +// case A goto B +// X = B B: +// case B ... +// X = C goto C +// +// The pass first checks that switch variable X is decided by the control flow +// path taken in the loop; for example, in case B, the next value of X is +// decided to be C. It then enumerates through all paths in the loop and labels +// the basic blocks where the next state is decided. +// +// Using this information it creates new paths that unconditionally branch to +// the next case. This involves cloning code, so it only gets triggered if the +// amount of code duplicated is below a threshold. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/DFAJumpThreading.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/SSAUpdaterBulk.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "dfa-jump-threading" + +STATISTIC(NumTransforms, "Number of transformations done"); +STATISTIC(NumCloned, "Number of blocks cloned"); +STATISTIC(NumPaths, "Number of individual paths threaded"); + +static cl::opt + ClViewCfgBefore("dfa-jump-view-cfg-before", + cl::desc("View the CFG before DFA Jump Threading"), + cl::Hidden, cl::init(false)); + +static cl::opt MaxPathLength( + "dfa-max-path-length", + cl::desc("Max number of blocks searched to find a threading path"), + cl::Hidden, cl::init(20)); + +static cl::opt + CostThreshold("dfa-cost-threshold", + cl::desc("Maximum cost accepted for the transformation"), + cl::Hidden, cl::init(50)); + +namespace { + +class SelectInstToUnfold { + SelectInst *SI; + PHINode *SIUse; + +public: + SelectInstToUnfold(SelectInst *SI, PHINode *SIUse) : SI(SI), SIUse(SIUse) {} + + SelectInst *getInst() { return SI; } + PHINode *getUse() { return SIUse; } + + explicit operator bool() const { return SI && SIUse; } +}; + +void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold, + std::vector *NewSIsToUnfold, + std::vector *NewBBs); + +class DFAJumpThreading { +public: + DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, + TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) + : AC(AC), DT(DT), TTI(TTI), ORE(ORE) {} + + bool run(Function &F); + +private: + void + unfoldSelectInstrs(DominatorTree *DT, + const SmallVector &SelectInsts) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SmallVector Stack; + for (SelectInstToUnfold SIToUnfold : SelectInsts) + Stack.push_back(SIToUnfold); + + while (!Stack.empty()) { + SelectInstToUnfold SIToUnfold = Stack.back(); + Stack.pop_back(); + + std::vector NewSIsToUnfold; + std::vector NewBBs; + unfold(&DTU, SIToUnfold, &NewSIsToUnfold, &NewBBs); + + // Put newly discovered select instructions into the work list. + for (const SelectInstToUnfold &NewSIToUnfold : NewSIsToUnfold) + Stack.push_back(NewSIToUnfold); + } + } + + AssumptionCache *AC; + DominatorTree *DT; + TargetTransformInfo *TTI; + OptimizationRemarkEmitter *ORE; +}; + +class DFAJumpThreadingLegacyPass : public FunctionPass { +public: + static char ID; // Pass identification + DFAJumpThreadingLegacyPass() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + AssumptionCache *AC = + &getAnalysis().getAssumptionCache(F); + DominatorTree *DT = &getAnalysis().getDomTree(); + TargetTransformInfo *TTI = + &getAnalysis().getTTI(F); + OptimizationRemarkEmitter *ORE = + &getAnalysis().getORE(); + + return DFAJumpThreading(AC, DT, TTI, ORE).run(F); + } +}; +} // end anonymous namespace + +char DFAJumpThreadingLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(DFAJumpThreadingLegacyPass, "dfa-jump-threading", + "DFA Jump Threading", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_END(DFAJumpThreadingLegacyPass, "dfa-jump-threading", + "DFA Jump Threading", false, false) + +// Public interface to the DFA Jump Threading pass +FunctionPass *llvm::createDFAJumpThreadingPass() { + return new DFAJumpThreadingLegacyPass(); +} + +namespace { + +/// Create a new basic block and sink \p SIToSink into it. +void createBasicBlockAndSinkSelectInst( + DomTreeUpdater *DTU, SelectInst *SI, PHINode *SIUse, SelectInst *SIToSink, + BasicBlock *EndBlock, StringRef NewBBName, BasicBlock **NewBlock, + BranchInst **NewBranch, std::vector *NewSIsToUnfold, + std::vector *NewBBs) { + assert(SIToSink->hasOneUse()); + assert(NewBlock); + assert(NewBranch); + *NewBlock = BasicBlock::Create(SI->getContext(), NewBBName, + EndBlock->getParent(), EndBlock); + NewBBs->push_back(*NewBlock); + *NewBranch = BranchInst::Create(EndBlock, *NewBlock); + SIToSink->moveBefore(*NewBranch); + NewSIsToUnfold->push_back(SelectInstToUnfold(SIToSink, SIUse)); + DTU->applyUpdates({{DominatorTree::Insert, *NewBlock, EndBlock}}); +} + +/// Unfold the select instruction held in \p SIToUnfold by replacing it with +/// control flow. +/// +/// Put newly discovered select instructions into \p NewSIsToUnfold. Put newly +/// created basic blocks into \p NewBBs. +/// +/// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible. +void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold, + std::vector *NewSIsToUnfold, + std::vector *NewBBs) { + SelectInst *SI = SIToUnfold.getInst(); + PHINode *SIUse = SIToUnfold.getUse(); + BasicBlock *StartBlock = SI->getParent(); + BasicBlock *EndBlock = SIUse->getParent(); + BranchInst *StartBlockTerm = + dyn_cast(StartBlock->getTerminator()); + + assert(StartBlockTerm && StartBlockTerm->isUnconditional()); + assert(SI->hasOneUse()); + + // These are the new basic blocks for the conditional branch. + // At least one will become an actual new basic block. + BasicBlock *TrueBlock = nullptr; + BasicBlock *FalseBlock = nullptr; + BranchInst *TrueBranch = nullptr; + BranchInst *FalseBranch = nullptr; + + // Sink select instructions to be able to unfold them later. + if (SelectInst *SIOp = dyn_cast(SI->getTrueValue())) { + createBasicBlockAndSinkSelectInst(DTU, SI, SIUse, SIOp, EndBlock, + "si.unfold.true", &TrueBlock, &TrueBranch, + NewSIsToUnfold, NewBBs); + } + if (SelectInst *SIOp = dyn_cast(SI->getFalseValue())) { + createBasicBlockAndSinkSelectInst(DTU, SI, SIUse, SIOp, EndBlock, + "si.unfold.false", &FalseBlock, + &FalseBranch, NewSIsToUnfold, NewBBs); + } + + // If there was nothing to sink, then arbitrarily choose the 'false' side + // for a new input value to the PHI. + if (!TrueBlock && !FalseBlock) { + FalseBlock = BasicBlock::Create(SI->getContext(), "si.unfold.false", + EndBlock->getParent(), EndBlock); + NewBBs->push_back(FalseBlock); + BranchInst::Create(EndBlock, FalseBlock); + DTU->applyUpdates({{DominatorTree::Insert, FalseBlock, EndBlock}}); + } + + // Insert the real conditional branch based on the original condition. + // If we did not create a new block for one of the 'true' or 'false' paths + // of the condition, it means that side of the branch goes to the end block + // directly and the path originates from the start block from the point of + // view of the new PHI. + BasicBlock *TT = EndBlock; + BasicBlock *FT = EndBlock; + if (TrueBlock && FalseBlock) { + // A diamond. + TT = TrueBlock; + FT = FalseBlock; + + // Update the phi node of SI. + SIUse->removeIncomingValue(StartBlock, /* DeletePHIIfEmpty = */ false); + SIUse->addIncoming(SI->getTrueValue(), TrueBlock); + SIUse->addIncoming(SI->getFalseValue(), FalseBlock); + + // Update any other PHI nodes in EndBlock. + for (PHINode &Phi : EndBlock->phis()) { + if (&Phi != SIUse) { + Phi.addIncoming(Phi.getIncomingValueForBlock(StartBlock), TrueBlock); + Phi.addIncoming(Phi.getIncomingValueForBlock(StartBlock), FalseBlock); + } + } + } else { + BasicBlock *NewBlock = nullptr; + Value *SIOp1 = SI->getTrueValue(); + Value *SIOp2 = SI->getFalseValue(); + + // A triangle pointing right. + if (!TrueBlock) { + NewBlock = FalseBlock; + FT = FalseBlock; + } + // A triangle pointing left. + else { + NewBlock = TrueBlock; + TT = TrueBlock; + std::swap(SIOp1, SIOp2); + } + + // Update the phi node of SI. + for (unsigned Idx = 0; Idx < SIUse->getNumIncomingValues(); ++Idx) { + if (SIUse->getIncomingBlock(Idx) == StartBlock) + SIUse->setIncomingValue(Idx, SIOp1); + } + SIUse->addIncoming(SIOp2, NewBlock); + + // Update any other PHI nodes in EndBlock. + for (auto II = EndBlock->begin(); PHINode *Phi = dyn_cast(II); + ++II) { + if (Phi != SIUse) + Phi->addIncoming(Phi->getIncomingValueForBlock(StartBlock), NewBlock); + } + } + StartBlockTerm->eraseFromParent(); + BranchInst::Create(TT, FT, SI->getCondition(), StartBlock); + DTU->applyUpdates({{DominatorTree::Insert, StartBlock, TT}, + {DominatorTree::Insert, StartBlock, FT}}); + + // The select is now dead. + SI->eraseFromParent(); +} + +struct ClonedBlock { + BasicBlock *BB; + uint64_t State; ///< \p State corresponds to the next value of a switch stmnt. +}; + +typedef std::deque PathType; +typedef std::vector PathsType; +typedef std::set VisitedBlocks; +typedef std::vector CloneList; + +// This data structure keeps track of all blocks that have been cloned. If two +// different ThreadingPaths clone the same block for a certain state it should +// be reused, and it can be looked up in this map. +typedef DenseMap DuplicateBlockMap; + +// This map keeps track of all the new definitions for an instruction. This +// information is needed when restoring SSA form after cloning blocks. +typedef DenseMap> DefMap; + +inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) { + OS << "< "; + for (const BasicBlock *BB : Path) { + std::string BBName; + if (BB->hasName()) + raw_string_ostream(BBName) << BB->getName(); + else + raw_string_ostream(BBName) << BB; + OS << BBName << " "; + } + OS << ">"; + return OS; +} + +/// ThreadingPath is a path in the control flow of a loop that can be threaded +/// by cloning necessary basic blocks and replacing conditional branches with +/// unconditional ones. A threading path includes a list of basic blocks, the +/// exit state, and the block that determines the next state. +struct ThreadingPath { + /// Exit value is DFA's exit state for the given path. + uint64_t getExitValue() const { return ExitVal; } + void setExitValue(const ConstantInt *V) { + ExitVal = V->getZExtValue(); + IsExitValSet = true; + } + bool isExitValueSet() const { return IsExitValSet; } + + /// Determinator is the basic block that determines the next state of the DFA. + const BasicBlock *getDeterminatorBB() const { return DBB; } + void setDeterminator(const BasicBlock *BB) { DBB = BB; } + + /// Path is a list of basic blocks. + const PathType &getPath() const { return Path; } + void setPath(const PathType &NewPath) { Path = NewPath; } + + void print(raw_ostream &OS) const { + OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]"; + } + +private: + PathType Path; + uint64_t ExitVal; + const BasicBlock *DBB = nullptr; + bool IsExitValSet = false; +}; + +#ifndef NDEBUG +inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) { + TPath.print(OS); + return OS; +} +#endif + +struct MainSwitch { + MainSwitch(SwitchInst *SI, OptimizationRemarkEmitter *ORE) { + if (isPredictable(SI)) { + Instr = SI; + } else { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable", SI) + << "Switch instruction is not predictable."; + }); + } + } + + virtual ~MainSwitch() = default; + + SwitchInst *getInstr() const { return Instr; } + const SmallVector getSelectInsts() { + return SelectInsts; + } + +private: + /// Do a use-def chain traversal. Make sure the value of the switch variable + /// is always a known constant. This means that all conditional jumps based on + /// switch variable can be converted to unconditional jumps. + bool isPredictable(const SwitchInst *SI) { + std::deque Q; + SmallSet SeenValues; + SelectInsts.clear(); + + Value *FirstDef = SI->getOperand(0); + auto *Inst = dyn_cast(FirstDef); + + // If this is a function argument or another non-instruction, then give up. + // We are interested in loop local variables. + if (!Inst) + return false; + + // Require the first definition to be a PHINode + if (!isa(Inst)) + return false; + + LLVM_DEBUG(dbgs() << "\tisPredictable() FirstDef: " << *Inst << "\n"); + + Q.push_back(Inst); + SeenValues.insert(FirstDef); + + while (!Q.empty()) { + Instruction *Current = Q.front(); + Q.pop_front(); + + if (auto *Phi = dyn_cast(Current)) { + for (Value *Incoming : Phi->incoming_values()) { + if (!isPredictableValue(Incoming, SeenValues)) + return false; + addInstToQueue(Incoming, Q, SeenValues); + } + LLVM_DEBUG(dbgs() << "\tisPredictable() phi: " << *Phi << "\n"); + } else if (SelectInst *SelI = dyn_cast(Current)) { + if (!isValidSelectInst(SelI)) + return false; + if (!isPredictableValue(SelI->getTrueValue(), SeenValues) || + !isPredictableValue(SelI->getFalseValue(), SeenValues)) { + return false; + } + addInstToQueue(SelI->getTrueValue(), Q, SeenValues); + addInstToQueue(SelI->getFalseValue(), Q, SeenValues); + LLVM_DEBUG(dbgs() << "\tisPredictable() select: " << *SelI << "\n"); + if (auto *SelIUse = dyn_cast(SelI->user_back())) + SelectInsts.push_back(SelectInstToUnfold(SelI, SelIUse)); + } else { + // If it is neither a phi nor a select, then we give up. + return false; + } + } + + return true; + } + + bool isPredictableValue(Value *InpVal, SmallSet &SeenValues) { + if (SeenValues.find(InpVal) != SeenValues.end()) + return true; + + if (isa(InpVal)) + return true; + + // If this is a function argument or another non-instruction, then give up. + if (!isa(InpVal)) + return false; + + return true; + } + + void addInstToQueue(Value *Val, std::deque &Q, + SmallSet &SeenValues) { + if (SeenValues.find(Val) != SeenValues.end()) + return; + if (Instruction *I = dyn_cast(Val)) + Q.push_back(I); + SeenValues.insert(Val); + } + + bool isValidSelectInst(SelectInst *SI) { + if (!SI->hasOneUse()) + return false; + + Instruction *SIUse = dyn_cast(SI->user_back()); + // The use of the select inst should be either a phi or another select. + if (!SIUse && !(isa(SIUse) || isa(SIUse))) + return false; + + BasicBlock *SIBB = SI->getParent(); + + // Currently, we can only expand select instructions in basic blocks with + // one successor. + BranchInst *SITerm = dyn_cast(SIBB->getTerminator()); + if (!SITerm || !SITerm->isUnconditional()) + return false; + + if (isa(SIUse) && + SIBB->getSingleSuccessor() != dyn_cast(SIUse)->getParent()) + return false; + + // If select will not be sunk during unfolding, and it is in the same basic + // block as another state defining select, then cannot unfold both. + for (SelectInstToUnfold SIToUnfold : SelectInsts) { + SelectInst *PrevSI = SIToUnfold.getInst(); + if (PrevSI->getTrueValue() != SI && PrevSI->getFalseValue() != SI && + PrevSI->getParent() == SI->getParent()) + return false; + } + + return true; + } + + SwitchInst *Instr = nullptr; + SmallVector SelectInsts; +}; + +struct AllSwitchPaths { + AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE) + : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()), + ORE(ORE) {} + + std::vector &getThreadingPaths() { return TPaths; } + unsigned getNumThreadingPaths() { return TPaths.size(); } + SwitchInst *getSwitchInst() { return Switch; } + BasicBlock *getSwitchBlock() { return SwitchBlock; } + + void run() { + VisitedBlocks Visited; + PathsType LoopPaths = paths(SwitchBlock, Visited, /* PathDepth = */ 1); + StateDefMap StateDef = getStateDefMap(); + + for (PathType Path : LoopPaths) { + ThreadingPath TPath; + + const BasicBlock *PrevBB = Path.back(); + for (const BasicBlock *BB : Path) { + if (StateDef.count(BB) != 0) { + const PHINode *Phi = dyn_cast(StateDef[BB]); + assert(Phi && "Expected a state-defining instr to be a phi node."); + + const Value *V = Phi->getIncomingValueForBlock(PrevBB); + if (const ConstantInt *C = dyn_cast(V)) { + TPath.setExitValue(C); + TPath.setDeterminator(BB); + TPath.setPath(Path); + } + } + + // Switch block is the determinator, this is the final exit value. + if (TPath.isExitValueSet() && BB == Path.front()) + break; + + PrevBB = BB; + } + + if (TPath.isExitValueSet()) + TPaths.push_back(TPath); + } + } + +private: + // Value: an instruction that defines a switch state; + // Key: the parent basic block of that instruction. + typedef DenseMap StateDefMap; + + PathsType paths(BasicBlock *BB, VisitedBlocks &Visited, + unsigned PathDepth) const { + PathsType Res; + + // Stop exploring paths after visiting MaxPathLength blocks + if (PathDepth > MaxPathLength) { + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "MaxPathLengthReached", + Switch) + << "Exploration stopped after visiting MaxPathLength=" + << ore::NV("MaxPathLength", MaxPathLength) << " blocks."; + }); + return Res; + } + + Visited.insert(BB); + + // Some blocks have multiple edges to the same successor, and this set + // is used to prevent a duplicate path from being generated + SmallSet Successors; + + for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) { + BasicBlock *Succ = *SI; + + if (Successors.find(Succ) != Successors.end()) + continue; + Successors.insert(Succ); + + // Found a cycle through the SwitchBlock + if (Succ == SwitchBlock) { + Res.push_back({BB}); + continue; + } + + // We have encountered a cycle, do not get caught in it + if (Visited.find(Succ) != Visited.end()) + continue; + + PathsType SuccPaths = paths(Succ, Visited, PathDepth + 1); + for (PathType Path : SuccPaths) { + PathType NewPath(Path); + NewPath.push_front(BB); + Res.push_back(NewPath); + } + } + // This block could now be visited again from a different predecessor. Note + // that this will result in exponential runtime. Subpaths could possibly be + // cached but it takes a lot of memory to store them. + Visited.erase(BB); + return Res; + } + + /// Walk the use-def chain and collect all the state-defining instructions. + StateDefMap getStateDefMap() const { + StateDefMap Res; + + Value *FirstDef = Switch->getOperand(0); + + assert(isa(FirstDef) && "After select unfolding, all state " + "definitions are expected to be phi " + "nodes."); + + SmallVector Stack; + Stack.push_back(dyn_cast(FirstDef)); + SmallSet SeenValues; + + while (!Stack.empty()) { + PHINode *CurPhi = Stack.back(); + Stack.pop_back(); + + Res[CurPhi->getParent()] = CurPhi; + SeenValues.insert(CurPhi); + + for (Value *Incoming : CurPhi->incoming_values()) { + if (Incoming == FirstDef || isa(Incoming) || + SeenValues.find(Incoming) != SeenValues.end()) { + continue; + } + + assert(isa(Incoming) && "After select unfolding, all state " + "definitions are expected to be phi " + "nodes."); + + Stack.push_back(cast(Incoming)); + } + } + + return Res; + } + + SwitchInst *Switch; + BasicBlock *SwitchBlock; + OptimizationRemarkEmitter *ORE; + std::vector TPaths; +}; + +struct TransformDFA { + TransformDFA(AllSwitchPaths *SwitchPaths, DominatorTree *DT, + AssumptionCache *AC, TargetTransformInfo *TTI, + OptimizationRemarkEmitter *ORE, + SmallPtrSet EphValues) + : SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE), + EphValues(EphValues) {} + + void run() { + if (isLegalAndProfitableToTransform()) { + createAllExitPaths(); + NumTransforms++; + } + } + +private: + /// This function performs both a legality check and profitability check at + /// the same time since it is convenient to do so. It iterates through all + /// blocks that will be cloned, and keeps track of the duplication cost. It + /// also returns false if it is illegal to clone some required block. + bool isLegalAndProfitableToTransform() { + CodeMetrics Metrics; + SwitchInst *Switch = SwitchPaths->getSwitchInst(); + + // Note that DuplicateBlockMap is not being used as intended here. It is + // just being used to ensure (BB, State) pairs are only counted once. + DuplicateBlockMap DuplicateMap; + + for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { + PathType PathBBs = TPath.getPath(); + uint64_t NextState = TPath.getExitValue(); + const BasicBlock *Determinator = TPath.getDeterminatorBB(); + + // Update Metrics for the Switch block, this is always cloned + BasicBlock *BB = SwitchPaths->getSwitchBlock(); + BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap); + if (!VisitedBB) { + Metrics.analyzeBasicBlock(BB, *TTI, EphValues); + DuplicateMap[BB].push_back({BB, NextState}); + } + + // If the Switch block is the Determinator, then we can continue since + // this is the only block that is cloned and we already counted for it. + if (PathBBs.front() == Determinator) + continue; + + // Otherwise update Metrics for all blocks that will be cloned. If any + // block is already cloned and would be reused, don't double count it. + auto DetIt = std::find(PathBBs.begin(), PathBBs.end(), Determinator); + for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { + BB = *BBIt; + VisitedBB = getClonedBB(BB, NextState, DuplicateMap); + if (VisitedBB) + continue; + Metrics.analyzeBasicBlock(BB, *TTI, EphValues); + DuplicateMap[BB].push_back({BB, NextState}); + } + + if (Metrics.notDuplicatable) { + LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " + << "non-duplicatable instructions.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NonDuplicatableInst", + Switch) + << "Contains non-duplicatable instructions."; + }); + return false; + } + + if (Metrics.convergent) { + LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " + << "convergent instructions.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst", Switch) + << "Contains convergent instructions."; + }); + return false; + } + } + + unsigned DuplicationCost = 0; + + unsigned JumpTableSize = 0; + TTI->getEstimatedNumberOfCaseClusters(*Switch, JumpTableSize, nullptr, + nullptr); + if (JumpTableSize == 0) { + // Factor in the number of conditional branches reduced from jump + // threading. Assume that lowering the switch block is implemented by + // using binary search, hence the LogBase2(). + unsigned CondBranches = + APInt(32, Switch->getNumSuccessors()).ceilLogBase2(); + DuplicationCost = Metrics.NumInsts / CondBranches; + } else { + // Compared with jump tables, the DFA optimizer removes an indirect branch + // on each loop iteration, thus making branch prediction more precise. The + // more branch targets there are, the more likely it is for the branch + // predictor to make a mistake, and the more benefit there is in the DFA + // optimizer. Thus, the more branch targets there are, the lower is the + // cost of the DFA opt. + DuplicationCost = Metrics.NumInsts / JumpTableSize; + } + + LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block " + << SwitchPaths->getSwitchBlock()->getName() + << " is: " << DuplicationCost << "\n\n"); + + if (DuplicationCost > CostThreshold) { + LLVM_DEBUG(dbgs() << "Not jump threading, duplication cost exceeds the " + << "cost threshold.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch) + << "Duplication cost exceeds the cost threshold (cost=" + << ore::NV("Cost", DuplicationCost) + << ", threshold=" << ore::NV("Threshold", CostThreshold) << ")."; + }); + return false; + } + + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "JumpThreaded", Switch) + << "Switch statement jump-threaded."; + }); + + return true; + } + + /// Transform each threading path to effectively jump thread the DFA. + void createAllExitPaths() { + DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Eager); + + // Move the switch block to the end of the path, since it will be duplicated + BasicBlock *SwitchBlock = SwitchPaths->getSwitchBlock(); + for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { + LLVM_DEBUG(dbgs() << TPath << "\n"); + PathType NewPath(TPath.getPath()); + NewPath.push_back(SwitchBlock); + TPath.setPath(NewPath); + } + + // Transform the ThreadingPaths and keep track of the cloned values + DuplicateBlockMap DuplicateMap; + DefMap NewDefs; + + SmallSet BlocksToClean; + for (BasicBlock *BB : successors(SwitchBlock)) + BlocksToClean.insert(BB); + + for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { + createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU); + NumPaths++; + } + + // After all paths are cloned, now update the last successor of the cloned + // path so it skips over the switch statement + for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) + updateLastSuccessor(TPath, DuplicateMap, &DTU); + + // For each instruction that was cloned and used outside, update its uses + updateSSA(NewDefs); + + // Clean PHI Nodes for the newly created blocks + for (BasicBlock *BB : BlocksToClean) + cleanPhiNodes(BB); + } + + /// For a specific ThreadingPath \p Path, create an exit path starting from + /// the determinator block. + /// + /// To remember the correct destination, we have to duplicate blocks + /// corresponding to each state. Also update the terminating instruction of + /// the predecessors, and phis in the successor blocks. + void createExitPath(DefMap &NewDefs, ThreadingPath &Path, + DuplicateBlockMap &DuplicateMap, + SmallSet &BlocksToClean, + DomTreeUpdater *DTU) { + uint64_t NextState = Path.getExitValue(); + const BasicBlock *Determinator = Path.getDeterminatorBB(); + PathType PathBBs = Path.getPath(); + + // Don't select the placeholder block in front + if (PathBBs.front() == Determinator) + PathBBs.pop_front(); + + auto DetIt = std::find(PathBBs.begin(), PathBBs.end(), Determinator); + auto Prev = std::prev(DetIt); + BasicBlock *PrevBB = *Prev; + for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { + BasicBlock *BB = *BBIt; + BlocksToClean.insert(BB); + + // We already cloned BB for this NextState, now just update the branch + // and continue. + BasicBlock *NextBB = getClonedBB(BB, NextState, DuplicateMap); + if (NextBB) { + updatePredecessor(PrevBB, BB, NextBB, DTU); + PrevBB = NextBB; + continue; + } + + // Clone the BB and update the successor of Prev to jump to the new block + BasicBlock *NewBB = cloneBlockAndUpdatePredecessor( + BB, PrevBB, NextState, DuplicateMap, NewDefs, DTU); + DuplicateMap[BB].push_back({NewBB, NextState}); + BlocksToClean.insert(NewBB); + PrevBB = NewBB; + } + } + + /// Restore SSA form after cloning blocks. + /// + /// Each cloned block creates new defs for a variable, and the uses need to be + /// updated to reflect this. The uses may be replaced with a cloned value, or + /// some derived phi instruction. Note that all uses of a value defined in the + /// same block were already remapped when cloning the block. + void updateSSA(DefMap &NewDefs) { + SSAUpdaterBulk SSAUpdate; + SmallVector UsesToRename; + + for (auto KV : NewDefs) { + Instruction *I = KV.first; + BasicBlock *BB = I->getParent(); + std::vector Cloned = KV.second; + + // Scan all uses of this instruction to see if it is used outside of its + // block, and if so, record them in UsesToRename. + for (Use &U : I->uses()) { + Instruction *User = cast(U.getUser()); + if (PHINode *UserPN = dyn_cast(User)) { + if (UserPN->getIncomingBlock(U) == BB) + continue; + } else if (User->getParent() == BB) { + continue; + } + + UsesToRename.push_back(&U); + } + + // If there are no uses outside the block, we're done with this + // instruction. + if (UsesToRename.empty()) + continue; + LLVM_DEBUG(dbgs() << "DFA-JT: Renaming non-local uses of: " << *I + << "\n"); + + // We found a use of I outside of BB. Rename all uses of I that are + // outside its block to be uses of the appropriate PHI node etc. See + // ValuesInBlocks with the values we know. + unsigned VarNum = SSAUpdate.AddVariable(I->getName(), I->getType()); + SSAUpdate.AddAvailableValue(VarNum, BB, I); + for (Instruction *New : Cloned) + SSAUpdate.AddAvailableValue(VarNum, New->getParent(), New); + + while (!UsesToRename.empty()) + SSAUpdate.AddUse(VarNum, UsesToRename.pop_back_val()); + + LLVM_DEBUG(dbgs() << "\n"); + } + // SSAUpdater handles phi placement and renaming uses with the appropriate + // value. + SSAUpdate.RewriteAllUses(DT); + } + + /// Clones a basic block, and adds it to the CFG. + /// + /// This function also includes updating phi nodes in the successors of the + /// BB, and remapping uses that were defined locally in the cloned BB. + BasicBlock *cloneBlockAndUpdatePredecessor(BasicBlock *BB, BasicBlock *PrevBB, + uint64_t NextState, + DuplicateBlockMap &DuplicateMap, + DefMap &NewDefs, + DomTreeUpdater *DTU) { + ValueToValueMapTy VMap; + BasicBlock *NewBB = CloneBasicBlock( + BB, VMap, ".jt" + std::to_string(NextState), BB->getParent()); + NewBB->moveAfter(BB); + NumCloned++; + + for (Instruction &I : *NewBB) { + // Do not remap operands of PHINode in case a definition in BB is an + // incoming value to a phi in the same block. This incoming value will + // be renamed later while restoring SSA. + if (isa(&I)) + continue; + RemapInstruction(&I, VMap, + RF_IgnoreMissingLocals | RF_NoModuleLevelChanges); + if (AssumeInst *II = dyn_cast(&I)) + AC->registerAssumption(II); + } + + updateSuccessorPhis(BB, NewBB, NextState, VMap, DuplicateMap); + updatePredecessor(PrevBB, BB, NewBB, DTU); + updateDefMap(NewDefs, VMap); + + // Add all successors to the DominatorTree + SmallPtrSet SuccSet; + for (auto *SuccBB : successors(NewBB)) { + if (SuccSet.insert(SuccBB).second) + DTU->applyUpdates({{DominatorTree::Insert, NewBB, SuccBB}}); + } + SuccSet.clear(); + return NewBB; + } + + /// Update the phi nodes in BB's successors. + /// + /// This means creating a new incoming value from NewBB with the new + /// instruction wherever there is an incoming value from BB. + void updateSuccessorPhis(BasicBlock *BB, BasicBlock *ClonedBB, + uint64_t NextState, ValueToValueMapTy &VMap, + DuplicateBlockMap &DuplicateMap) { + std::vector BlocksToUpdate; + + // If BB is the last block in the path, we can simply update the one case + // successor that will be reached. + if (BB == SwitchPaths->getSwitchBlock()) { + SwitchInst *Switch = SwitchPaths->getSwitchInst(); + BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); + BlocksToUpdate.push_back(NextCase); + BasicBlock *ClonedSucc = getClonedBB(NextCase, NextState, DuplicateMap); + if (ClonedSucc) + BlocksToUpdate.push_back(ClonedSucc); + } + // Otherwise update phis in all successors. + else { + for (BasicBlock *Succ : successors(BB)) { + BlocksToUpdate.push_back(Succ); + + // Check if a successor has already been cloned for the particular exit + // value. In this case if a successor was already cloned, the phi nodes + // in the cloned block should be updated directly. + BasicBlock *ClonedSucc = getClonedBB(Succ, NextState, DuplicateMap); + if (ClonedSucc) + BlocksToUpdate.push_back(ClonedSucc); + } + } + + // If there is a phi with an incoming value from BB, create a new incoming + // value for the new predecessor ClonedBB. The value will either be the same + // value from BB or a cloned value. + for (BasicBlock *Succ : BlocksToUpdate) { + for (auto II = Succ->begin(); PHINode *Phi = dyn_cast(II); + ++II) { + Value *Incoming = Phi->getIncomingValueForBlock(BB); + if (Incoming) { + if (isa(Incoming)) { + Phi->addIncoming(Incoming, ClonedBB); + continue; + } + Value *ClonedVal = VMap[Incoming]; + if (ClonedVal) + Phi->addIncoming(ClonedVal, ClonedBB); + else + Phi->addIncoming(Incoming, ClonedBB); + } + } + } + } + + /// Sets the successor of PrevBB to be NewBB instead of OldBB. Note that all + /// other successors are kept as well. + void updatePredecessor(BasicBlock *PrevBB, BasicBlock *OldBB, + BasicBlock *NewBB, DomTreeUpdater *DTU) { + // When a path is reused, there is a chance that predecessors were already + // updated before. Check if the predecessor needs to be updated first. + if (!isPredecessor(OldBB, PrevBB)) + return; + + Instruction *PrevTerm = PrevBB->getTerminator(); + for (unsigned Idx = 0; Idx < PrevTerm->getNumSuccessors(); Idx++) { + if (PrevTerm->getSuccessor(Idx) == OldBB) { + OldBB->removePredecessor(PrevBB, /* KeepOneInputPHIs = */ true); + PrevTerm->setSuccessor(Idx, NewBB); + } + } + DTU->applyUpdates({{DominatorTree::Delete, PrevBB, OldBB}, + {DominatorTree::Insert, PrevBB, NewBB}}); + } + + /// Add new value mappings to the DefMap to keep track of all new definitions + /// for a particular instruction. These will be used while updating SSA form. + void updateDefMap(DefMap &NewDefs, ValueToValueMapTy &VMap) { + for (auto Entry : VMap) { + Instruction *Inst = + dyn_cast(const_cast(Entry.first)); + if (!Inst || !Entry.second || isa(Inst) || + isa(Inst)) { + continue; + } + + Instruction *Cloned = dyn_cast(Entry.second); + if (!Cloned) + continue; + + if (NewDefs.find(Inst) == NewDefs.end()) + NewDefs[Inst] = {Cloned}; + else + NewDefs[Inst].push_back(Cloned); + } + } + + /// Update the last branch of a particular cloned path to point to the correct + /// case successor. + /// + /// Note that this is an optional step and would have been done in later + /// optimizations, but it makes the CFG significantly easier to work with. + void updateLastSuccessor(ThreadingPath &TPath, + DuplicateBlockMap &DuplicateMap, + DomTreeUpdater *DTU) { + uint64_t NextState = TPath.getExitValue(); + BasicBlock *BB = TPath.getPath().back(); + BasicBlock *LastBlock = getClonedBB(BB, NextState, DuplicateMap); + + // Note multiple paths can end at the same block so check that it is not + // updated yet + if (!isa(LastBlock->getTerminator())) + return; + SwitchInst *Switch = cast(LastBlock->getTerminator()); + BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); + + std::vector DTUpdates; + SmallPtrSet SuccSet; + for (BasicBlock *Succ : successors(LastBlock)) { + if (Succ != NextCase && SuccSet.insert(Succ).second) + DTUpdates.push_back({DominatorTree::Delete, LastBlock, Succ}); + } + + Switch->eraseFromParent(); + BranchInst::Create(NextCase, LastBlock); + + DTU->applyUpdates(DTUpdates); + } + + /// After cloning blocks, some of the phi nodes have extra incoming values + /// that are no longer used. This function removes them. + void cleanPhiNodes(BasicBlock *BB) { + // If BB is no longer reachable, remove any remaining phi nodes + if (pred_empty(BB)) { + std::vector PhiToRemove; + for (auto II = BB->begin(); PHINode *Phi = dyn_cast(II); ++II) { + PhiToRemove.push_back(Phi); + } + for (PHINode *PN : PhiToRemove) { + PN->replaceAllUsesWith(UndefValue::get(PN->getType())); + PN->eraseFromParent(); + } + return; + } + + // Remove any incoming values that come from an invalid predecessor + for (auto II = BB->begin(); PHINode *Phi = dyn_cast(II); ++II) { + std::vector BlocksToRemove; + for (BasicBlock *IncomingBB : Phi->blocks()) { + if (!isPredecessor(BB, IncomingBB)) + BlocksToRemove.push_back(IncomingBB); + } + for (BasicBlock *BB : BlocksToRemove) + Phi->removeIncomingValue(BB); + } + } + + /// Checks if BB was already cloned for a particular next state value. If it + /// was then it returns this cloned block, and otherwise null. + BasicBlock *getClonedBB(BasicBlock *BB, uint64_t NextState, + DuplicateBlockMap &DuplicateMap) { + CloneList ClonedBBs = DuplicateMap[BB]; + + // Find an entry in the CloneList with this NextState. If it exists then + // return the corresponding BB + auto It = llvm::find_if(ClonedBBs, [NextState](const ClonedBlock &C) { + return C.State == NextState; + }); + return It != ClonedBBs.end() ? (*It).BB : nullptr; + } + + /// Helper to get the successor corresponding to a particular case value for + /// a switch statement. + BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, uint64_t NextState) { + BasicBlock *NextCase = nullptr; + for (auto Case : Switch->cases()) { + if (Case.getCaseValue()->getZExtValue() == NextState) { + NextCase = Case.getCaseSuccessor(); + break; + } + } + if (!NextCase) + NextCase = Switch->getDefaultDest(); + return NextCase; + } + + /// Returns true if IncomingBB is a predecessor of BB. + bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) { + return llvm::find(predecessors(BB), IncomingBB) != pred_end(BB); + } + + AllSwitchPaths *SwitchPaths; + DominatorTree *DT; + AssumptionCache *AC; + TargetTransformInfo *TTI; + OptimizationRemarkEmitter *ORE; + SmallPtrSet EphValues; + std::vector TPaths; +}; + +bool DFAJumpThreading::run(Function &F) { + LLVM_DEBUG(dbgs() << "\nDFA Jump threading: " << F.getName() << "\n"); + + if (F.hasOptSize()) { + LLVM_DEBUG(dbgs() << "Skipping due to the 'minsize' attribute\n"); + return false; + } + + if (ClViewCfgBefore) + F.viewCFG(); + + SmallVector ThreadableLoops; + bool MadeChanges = false; + + for (BasicBlock &BB : F) { + auto *SI = dyn_cast(BB.getTerminator()); + if (!SI) + continue; + + LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName() + << " is predictable\n"); + MainSwitch Switch(SI, ORE); + + if (!Switch.getInstr()) + continue; + + LLVM_DEBUG(dbgs() << "\nSwitchInst in BB " << BB.getName() << " is a " + << "candidate for jump threading\n"); + LLVM_DEBUG(SI->dump()); + + unfoldSelectInstrs(DT, Switch.getSelectInsts()); + if (!Switch.getSelectInsts().empty()) + MadeChanges = true; + + AllSwitchPaths SwitchPaths(&Switch, ORE); + SwitchPaths.run(); + + if (SwitchPaths.getNumThreadingPaths() > 0) { + ThreadableLoops.push_back(SwitchPaths); + + // For the time being limit this optimization to occurring once in a + // function since it can change the CFG significantly. This is not a + // strict requirement but it can cause buggy behavior if there is an + // overlap of blocks in different opportunities. There is a lot of room to + // experiment with catching more opportunities here. + break; + } + } + + SmallPtrSet EphValues; + if (ThreadableLoops.size() > 0) + CodeMetrics::collectEphemeralValues(&F, AC, EphValues); + + for (AllSwitchPaths SwitchPaths : ThreadableLoops) { + TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues); + Transform.run(); + MadeChanges = true; + } + +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Full)); + verifyFunction(F, &dbgs()); +#endif + + return MadeChanges; +} + +} // end anonymous namespace + +/// Integrate with the new Pass Manager +PreservedAnalyses DFAJumpThreadingPass::run(Function &F, + FunctionAnalysisManager &AM) { + AssumptionCache &AC = AM.getResult(F); + DominatorTree &DT = AM.getResult(F); + TargetTransformInfo &TTI = AM.getResult(F); + OptimizationRemarkEmitter ORE(&F); + + if (!DFAJumpThreading(&AC, &DT, &TTI, &ORE).run(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve(); + return PA; +} diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -60,6 +60,7 @@ initializeInferAddressSpacesPass(Registry); initializeInstSimplifyLegacyPassPass(Registry); initializeJumpThreadingPass(Registry); + initializeDFAJumpThreadingLegacyPassPass(Registry); initializeLegacyLICMPassPass(Registry); initializeLegacyLoopSinkPassPass(Registry); initializeLoopFuseLegacyPass(Registry); diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -dfa-jump-threading -sccp -simplifycfg %s | FileCheck %s + +; This test checks that a constant propagation is applied for a basic loop. +; Related to bug 44679. +define i32 @test(i32 %a) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 3 +; +entry: + br label %while.cond + +while.cond: + %num = phi i32 [ 0, %entry ], [ %add, %case1 ] + %state = phi i32 [ 1, %entry ], [ %state.next, %case1 ] + switch i32 %state, label %end [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + %state.next = phi i32 [ 3, %case2 ], [ 2, %while.cond ] + %add = add nsw i32 %num, %state + br label %while.cond + +case2: + br label %case1 + +end: + ret i32 %num +} diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll @@ -0,0 +1,180 @@ +; REQUIRES: asserts +; RUN: opt -S -dfa-jump-threading -debug-only=dfa-jump-threading -disable-output %s 2>&1 | FileCheck %s + +; This test checks that the analysis identifies all threadable paths in a +; simple CFG. A threadable path includes a list of basic blocks, the exit +; state, and the block that determines the next state. +; < path of BBs that form a cycle > [ state, determinator ] +define i32 @test1(i32 %num) { +; CHECK: < for.body for.inc > [ 1, for.inc ] +; CHECK-NEXT: < for.body case1 for.inc > [ 2, for.inc ] +; CHECK-NEXT: < for.body case2 for.inc > [ 1, for.inc ] +; CHECK-NEXT: < for.body case2 si.unfold.false for.inc > [ 2, for.inc ] +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %for.inc + +case2: + %cmp = icmp eq i32 %count, 50 + %sel = select i1 %cmp, i32 1, i32 2 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ] + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +} + +; This test checks that the analysis finds threadable paths in a more +; complicated CFG. Here the FSM is represented as a nested loop, with +; fallthrough cases. +define i32 @test2(i32 %init) { +; CHECK: < loop.3 case2 > [ 3, loop.3 ] +; CHECK-NEXT: < loop.3 case2 loop.1.backedge loop.1 loop.2 > [ 1, loop.1 ] +; CHECK-NEXT: < loop.3 case2 loop.1.backedge si.unfold.false loop.1 loop.2 > [ 4, loop.1.backedge ] +; CHECK-NEXT: < loop.3 case3 loop.2.backedge loop.2 > [ 0, loop.2.backedge ] +; CHECK-NEXT: < loop.3 case3 case4 loop.2.backedge loop.2 > [ 3, loop.2.backedge ] +; CHECK-NEXT: < loop.3 case3 case4 loop.1.backedge loop.1 loop.2 > [ 1, loop.1 ] +; CHECK-NEXT: < loop.3 case3 case4 loop.1.backedge si.unfold.false loop.1 loop.2 > [ 2, loop.1.backedge ] +; CHECK-NEXT: < loop.3 case4 loop.2.backedge loop.2 > [ 3, loop.2.backedge ] +; CHECK-NEXT: < loop.3 case4 loop.1.backedge loop.1 loop.2 > [ 1, loop.1 ] +; CHECK-NEXT: < loop.3 case4 loop.1.backedge si.unfold.false loop.1 loop.2 > [ 2, loop.1.backedge ] +entry: + %cmp = icmp eq i32 %init, 0 + %sel = select i1 %cmp, i32 0, i32 2 + br label %loop.1 + +loop.1: + %state.1 = phi i32 [ %sel, %entry ], [ %state.1.be2, %loop.1.backedge ] + br label %loop.2 + +loop.2: + %state.2 = phi i32 [ %state.1, %loop.1 ], [ %state.2.be, %loop.2.backedge ] + br label %loop.3 + +loop.3: + %state = phi i32 [ %state.2, %loop.2 ], [ 3, %case2 ] + switch i32 %state, label %infloop.i [ + i32 2, label %case2 + i32 3, label %case3 + i32 4, label %case4 + i32 0, label %case0 + i32 1, label %case1 + ] + +case2: + br i1 %cmp, label %loop.3, label %loop.1.backedge + +case3: + br i1 %cmp, label %loop.2.backedge, label %case4 + +case4: + br i1 %cmp, label %loop.2.backedge, label %loop.1.backedge + +loop.1.backedge: + %state.1.be = phi i32 [ 2, %case4 ], [ 4, %case2 ] + %state.1.be2 = select i1 %cmp, i32 1, i32 %state.1.be + br label %loop.1 + +loop.2.backedge: + %state.2.be = phi i32 [ 3, %case4 ], [ 0, %case3 ] + br label %loop.2 + +case0: + br label %exit + +case1: + br label %exit + +infloop.i: + br label %infloop.i + +exit: + ret i32 0 +} + +declare void @baz() + +; Verify that having the switch block as a determinator is handled correctly. +define i32 @main() { +; CHECK: < bb43 bb59 bb3 bb31 bb41 > [ 77, bb43 ] +; CHECK-NEXT: < bb43 bb49 bb59 bb3 bb31 bb41 > [ 77, bb43 ] +bb: + %i = alloca [420 x i8], align 1 + %i2 = getelementptr inbounds [420 x i8], [420 x i8]* %i, i64 0, i64 390 + br label %bb3 + +bb3: ; preds = %bb59, %bb + %i4 = phi i8* [ %i2, %bb ], [ %i60, %bb59 ] + %i5 = phi i8 [ 77, %bb ], [ %i64, %bb59 ] + %i6 = phi i32 [ 2, %bb ], [ %i63, %bb59 ] + %i7 = phi i32 [ 26, %bb ], [ %i62, %bb59 ] + %i8 = phi i32 [ 25, %bb ], [ %i61, %bb59 ] + %i9 = icmp sgt i32 %i7, 2 + %i10 = select i1 %i9, i32 %i7, i32 2 + %i11 = add i32 %i8, 2 + %i12 = sub i32 %i11, %i10 + %i13 = mul nsw i32 %i12, 3 + %i14 = add nsw i32 %i13, %i6 + %i15 = sext i32 %i14 to i64 + %i16 = getelementptr inbounds i8, i8* %i4, i64 %i15 + %i17 = load i8, i8* %i16, align 1 + %i18 = icmp sgt i8 %i17, 0 + br i1 %i18, label %bb21, label %bb31 + +bb21: ; preds = %bb3 + br i1 true, label %bb59, label %bb43 + +bb59: ; preds = %bb49, %bb43, %bb31, %bb21 + %i60 = phi i8* [ %i44, %bb49 ], [ %i44, %bb43 ], [ %i34, %bb31 ], [ %i4, %bb21 ] + %i61 = phi i32 [ %i45, %bb49 ], [ %i45, %bb43 ], [ %i33, %bb31 ], [ %i8, %bb21 ] + %i62 = phi i32 [ %i47, %bb49 ], [ %i47, %bb43 ], [ %i32, %bb31 ], [ %i7, %bb21 ] + %i63 = phi i32 [ %i48, %bb49 ], [ %i48, %bb43 ], [ 2, %bb31 ], [ %i6, %bb21 ] + %i64 = phi i8 [ %i46, %bb49 ], [ %i46, %bb43 ], [ 77, %bb31 ], [ %i5, %bb21 ] + %i65 = icmp sgt i32 %i62, 0 + br i1 %i65, label %bb3, label %bb66 + +bb31: ; preds = %bb3 + %i32 = add nsw i32 %i7, -1 + %i33 = add nsw i32 %i8, -1 + %i34 = getelementptr inbounds i8, i8* %i4, i64 -15 + %i35 = icmp eq i8 %i5, 77 + br i1 %i35, label %bb59, label %bb41 + +bb41: ; preds = %bb31 + tail call void @baz() + br label %bb43 + +bb43: ; preds = %bb41, %bb21 + %i44 = phi i8* [ %i34, %bb41 ], [ %i4, %bb21 ] + %i45 = phi i32 [ %i33, %bb41 ], [ %i8, %bb21 ] + %i46 = phi i8 [ 77, %bb41 ], [ %i5, %bb21 ] + %i47 = phi i32 [ %i32, %bb41 ], [ %i7, %bb21 ] + %i48 = phi i32 [ 2, %bb41 ], [ %i6, %bb21 ] + tail call void @baz() + switch i8 %i5, label %bb59 [ + i8 68, label %bb49 + i8 73, label %bb49 + ] + +bb49: ; preds = %bb43, %bb43 + tail call void @baz() + br label %bb59 + +bb66: ; preds = %bb59 + ret i32 0 +} diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -dfa-jump-threading %s | FileCheck %s + +; These tests check that the DFA jump threading transformation is applied +; properly to two CFGs. It checks that blocks are cloned, branches are updated, +; and SSA form is restored. +define i32 @test1(i32 %num) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ undef, [[FOR_INC]] ] +; CHECK-NEXT: switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [ +; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 2, label [[CASE2:%.*]] +; CHECK-NEXT: ] +; CHECK: for.body.jt2: +; CHECK-NEXT: [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ] +; CHECK-NEXT: [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ] +; CHECK-NEXT: br label [[CASE2]] +; CHECK: for.body.jt1: +; CHECK-NEXT: [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: br label [[CASE1]] +; CHECK: case1: +; CHECK-NEXT: [[COUNT2:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: br label [[FOR_INC_JT2]] +; CHECK: case2: +; CHECK-NEXT: [[COUNT1:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[COUNT1]], 50 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE:%.*]] +; CHECK: si.unfold.false: +; CHECK-NEXT: br label [[FOR_INC_JT2]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 undef, 1 +; CHECK-NEXT: [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]] +; CHECK-NEXT: br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.inc.jt2: +; CHECK-NEXT: [[COUNT4:%.*]] = phi i32 [ [[COUNT1]], [[SI_UNFOLD_FALSE]] ], [ [[COUNT2]], [[CASE1]] ] +; CHECK-NEXT: [[STATE_NEXT_JT2]] = phi i32 [ 2, [[CASE1]] ], [ 2, [[SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[INC_JT2]] = add nsw i32 [[COUNT4]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]] +; CHECK: for.inc.jt1: +; CHECK-NEXT: [[COUNT3:%.*]] = phi i32 [ [[COUNT1]], [[CASE2]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[STATE_NEXT_JT1]] = phi i32 [ 1, [[CASE2]] ], [ 1, [[FOR_BODY]] ] +; CHECK-NEXT: [[INC_JT1]] = add nsw i32 [[COUNT3]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %for.inc + +case2: + %cmp = icmp eq i32 %count, 50 + %sel = select i1 %cmp, i32 1, i32 2 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ] + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +} + + +define i32 @test2(i32 %init) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INIT:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_1:%.*]], label [[SI_UNFOLD_FALSE1:%.*]] +; CHECK: si.unfold.false: +; CHECK-NEXT: br label [[LOOP_1]] +; CHECK: si.unfold.false.jt2: +; CHECK-NEXT: br label [[LOOP_1_JT2:%.*]] +; CHECK: si.unfold.false.jt4: +; CHECK-NEXT: br label [[LOOP_1_JT4:%.*]] +; CHECK: si.unfold.false1: +; CHECK-NEXT: br label [[LOOP_1]] +; CHECK: loop.1: +; CHECK-NEXT: [[STATE_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ undef, [[SI_UNFOLD_FALSE:%.*]] ], [ 2, [[SI_UNFOLD_FALSE1]] ] +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.1.jt2: +; CHECK-NEXT: [[STATE_1_JT2:%.*]] = phi i32 [ [[STATE_1_BE_JT2:%.*]], [[SI_UNFOLD_FALSE_JT2:%.*]] ] +; CHECK-NEXT: br label [[LOOP_2_JT2:%.*]] +; CHECK: loop.1.jt4: +; CHECK-NEXT: [[STATE_1_JT4:%.*]] = phi i32 [ [[STATE_1_BE_JT4:%.*]], [[SI_UNFOLD_FALSE_JT4:%.*]] ] +; CHECK-NEXT: br label [[LOOP_2_JT4:%.*]] +; CHECK: loop.1.jt1: +; CHECK-NEXT: [[STATE_1_JT1:%.*]] = phi i32 [ 1, [[LOOP_1_BACKEDGE:%.*]] ], [ 1, [[LOOP_1_BACKEDGE_JT4:%.*]] ], [ 1, [[LOOP_1_BACKEDGE_JT2:%.*]] ] +; CHECK-NEXT: br label [[LOOP_2_JT1:%.*]] +; CHECK: loop.2: +; CHECK-NEXT: [[STATE_2:%.*]] = phi i32 [ [[STATE_1]], [[LOOP_1]] ], [ undef, [[LOOP_2_BACKEDGE:%.*]] ] +; CHECK-NEXT: br label [[LOOP_3:%.*]] +; CHECK: loop.2.jt2: +; CHECK-NEXT: [[STATE_2_JT2:%.*]] = phi i32 [ [[STATE_1_JT2]], [[LOOP_1_JT2]] ] +; CHECK-NEXT: br label [[LOOP_3_JT2:%.*]] +; CHECK: loop.2.jt3: +; CHECK-NEXT: [[STATE_2_JT3:%.*]] = phi i32 [ [[STATE_2_BE_JT3:%.*]], [[LOOP_2_BACKEDGE_JT3:%.*]] ] +; CHECK-NEXT: br label [[LOOP_3_JT3:%.*]] +; CHECK: loop.2.jt0: +; CHECK-NEXT: [[STATE_2_JT0:%.*]] = phi i32 [ [[STATE_2_BE_JT0:%.*]], [[LOOP_2_BACKEDGE_JT0:%.*]] ] +; CHECK-NEXT: br label [[LOOP_3_JT0:%.*]] +; CHECK: loop.2.jt4: +; CHECK-NEXT: [[STATE_2_JT4:%.*]] = phi i32 [ [[STATE_1_JT4]], [[LOOP_1_JT4]] ] +; CHECK-NEXT: br label [[LOOP_3_JT4:%.*]] +; CHECK: loop.2.jt1: +; CHECK-NEXT: [[STATE_2_JT1:%.*]] = phi i32 [ [[STATE_1_JT1]], [[LOOP_1_JT1:%.*]] ] +; CHECK-NEXT: br label [[LOOP_3_JT1:%.*]] +; CHECK: loop.3: +; CHECK-NEXT: [[STATE:%.*]] = phi i32 [ [[STATE_2]], [[LOOP_2]] ] +; CHECK-NEXT: switch i32 [[STATE]], label [[INFLOOP_I:%.*]] [ +; CHECK-NEXT: i32 2, label [[CASE2:%.*]] +; CHECK-NEXT: i32 3, label [[CASE3:%.*]] +; CHECK-NEXT: i32 4, label [[CASE4:%.*]] +; CHECK-NEXT: i32 0, label [[CASE0:%.*]] +; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: ] +; CHECK: loop.3.jt2: +; CHECK-NEXT: [[STATE_JT2:%.*]] = phi i32 [ [[STATE_2_JT2]], [[LOOP_2_JT2]] ] +; CHECK-NEXT: br label [[CASE2]] +; CHECK: loop.3.jt0: +; CHECK-NEXT: [[STATE_JT0:%.*]] = phi i32 [ [[STATE_2_JT0]], [[LOOP_2_JT0:%.*]] ] +; CHECK-NEXT: br label [[CASE0]] +; CHECK: loop.3.jt4: +; CHECK-NEXT: [[STATE_JT4:%.*]] = phi i32 [ [[STATE_2_JT4]], [[LOOP_2_JT4]] ] +; CHECK-NEXT: br label [[CASE4]] +; CHECK: loop.3.jt1: +; CHECK-NEXT: [[STATE_JT1:%.*]] = phi i32 [ [[STATE_2_JT1]], [[LOOP_2_JT1]] ] +; CHECK-NEXT: br label [[CASE1]] +; CHECK: loop.3.jt3: +; CHECK-NEXT: [[STATE_JT3:%.*]] = phi i32 [ 3, [[CASE2]] ], [ [[STATE_2_JT3]], [[LOOP_2_JT3:%.*]] ] +; CHECK-NEXT: br label [[CASE3]] +; CHECK: case2: +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_3_JT3]], label [[LOOP_1_BACKEDGE_JT4]] +; CHECK: case3: +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_2_BACKEDGE_JT0]], label [[CASE4]] +; CHECK: case4: +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_2_BACKEDGE_JT3]], label [[LOOP_1_BACKEDGE_JT2]] +; CHECK: loop.1.backedge: +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_1_JT1]], label [[SI_UNFOLD_FALSE]] +; CHECK: loop.1.backedge.jt2: +; CHECK-NEXT: [[STATE_1_BE_JT2]] = phi i32 [ 2, [[CASE4]] ] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_1_JT1]], label [[SI_UNFOLD_FALSE_JT2]] +; CHECK: loop.1.backedge.jt4: +; CHECK-NEXT: [[STATE_1_BE_JT4]] = phi i32 [ 4, [[CASE2]] ] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_1_JT1]], label [[SI_UNFOLD_FALSE_JT4]] +; CHECK: loop.2.backedge: +; CHECK-NEXT: br label [[LOOP_2]] +; CHECK: loop.2.backedge.jt3: +; CHECK-NEXT: [[STATE_2_BE_JT3]] = phi i32 [ 3, [[CASE4]] ] +; CHECK-NEXT: br label [[LOOP_2_JT3]] +; CHECK: loop.2.backedge.jt0: +; CHECK-NEXT: [[STATE_2_BE_JT0]] = phi i32 [ 0, [[CASE3]] ] +; CHECK-NEXT: br label [[LOOP_2_JT0]] +; CHECK: case0: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: case1: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: infloop.i: +; CHECK-NEXT: br label [[INFLOOP_I]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %cmp = icmp eq i32 %init, 0 + %sel = select i1 %cmp, i32 0, i32 2 + br label %loop.1 + +loop.1: + %state.1 = phi i32 [ %sel, %entry ], [ %state.1.be2, %loop.1.backedge ] + br label %loop.2 + +loop.2: + %state.2 = phi i32 [ %state.1, %loop.1 ], [ %state.2.be, %loop.2.backedge ] + br label %loop.3 + +loop.3: + %state = phi i32 [ %state.2, %loop.2 ], [ 3, %case2 ] + switch i32 %state, label %infloop.i [ + i32 2, label %case2 + i32 3, label %case3 + i32 4, label %case4 + i32 0, label %case0 + i32 1, label %case1 + ] + +case2: + br i1 %cmp, label %loop.3, label %loop.1.backedge + +case3: + br i1 %cmp, label %loop.2.backedge, label %case4 + +case4: + br i1 %cmp, label %loop.2.backedge, label %loop.1.backedge + +loop.1.backedge: + %state.1.be = phi i32 [ 2, %case4 ], [ 4, %case2 ] + %state.1.be2 = select i1 %cmp, i32 1, i32 %state.1.be + br label %loop.1 + +loop.2.backedge: + %state.2.be = phi i32 [ 3, %case4 ], [ 0, %case3 ] + br label %loop.2 + +case0: + br label %exit + +case1: + br label %exit + +infloop.i: + br label %infloop.i + +exit: + ret i32 0 +} diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll @@ -0,0 +1,293 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -dfa-jump-threading %s | FileCheck %s + +; These tests check if selects are unfolded properly for jump threading +; opportunities. There are three different patterns to consider: +; 1) Both operands are constant and the false branch is unfolded by default +; 2) One operand is constant and the other is another select to be unfolded. In +; this case a single select is sunk to a new block to unfold. +; 3) Both operands are a select, and both should be sunk to new blocks. +define i32 @test1(i32 %num) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ undef, [[FOR_INC]] ] +; CHECK-NEXT: switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [ +; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 2, label [[CASE2:%.*]] +; CHECK-NEXT: ] +; CHECK: for.body.jt2: +; CHECK-NEXT: [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ] +; CHECK-NEXT: [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ] +; CHECK-NEXT: br label [[CASE2]] +; CHECK: for.body.jt1: +; CHECK-NEXT: [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: br label [[CASE1]] +; CHECK: case1: +; CHECK-NEXT: [[COUNT2:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: br label [[FOR_INC_JT2]] +; CHECK: case2: +; CHECK-NEXT: [[COUNT1:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[COUNT1]], 50 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE:%.*]] +; CHECK: si.unfold.false: +; CHECK-NEXT: br label [[FOR_INC_JT2]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 undef, 1 +; CHECK-NEXT: [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]] +; CHECK-NEXT: br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.inc.jt2: +; CHECK-NEXT: [[COUNT4:%.*]] = phi i32 [ [[COUNT1]], [[SI_UNFOLD_FALSE]] ], [ [[COUNT2]], [[CASE1]] ] +; CHECK-NEXT: [[STATE_NEXT_JT2]] = phi i32 [ 2, [[CASE1]] ], [ 2, [[SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[INC_JT2]] = add nsw i32 [[COUNT4]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]] +; CHECK: for.inc.jt1: +; CHECK-NEXT: [[COUNT3:%.*]] = phi i32 [ [[COUNT1]], [[CASE2]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[STATE_NEXT_JT1]] = phi i32 [ 1, [[CASE2]] ], [ 1, [[FOR_BODY]] ] +; CHECK-NEXT: [[INC_JT1]] = add nsw i32 [[COUNT3]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %for.inc + +case2: + %cmp = icmp slt i32 %count, 50 + %sel = select i1 %cmp, i32 1, i32 2 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ] + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +} + +define i32 @test2(i32 %num) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ undef, [[FOR_INC]] ] +; CHECK-NEXT: switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [ +; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 2, label [[CASE2:%.*]] +; CHECK-NEXT: ] +; CHECK: for.body.jt3: +; CHECK-NEXT: [[COUNT_JT3:%.*]] = phi i32 [ [[INC_JT3:%.*]], [[FOR_INC_JT3:%.*]] ] +; CHECK-NEXT: [[STATE_JT3:%.*]] = phi i32 [ [[STATE_NEXT_JT3:%.*]], [[FOR_INC_JT3]] ] +; CHECK-NEXT: br label [[FOR_INC_JT1]] +; CHECK: for.body.jt2: +; CHECK-NEXT: [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ] +; CHECK-NEXT: [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ] +; CHECK-NEXT: br label [[CASE2]] +; CHECK: for.body.jt1: +; CHECK-NEXT: [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: br label [[CASE1]] +; CHECK: case1: +; CHECK-NEXT: [[COUNT5:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP_C1:%.*]] = icmp slt i32 [[COUNT5]], 50 +; CHECK-NEXT: [[CMP2_C1:%.*]] = icmp slt i32 [[COUNT5]], 100 +; CHECK-NEXT: br i1 [[CMP2_C1]], label [[SI_UNFOLD_TRUE:%.*]], label [[FOR_INC_JT3]] +; CHECK: case2: +; CHECK-NEXT: [[COUNT3:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP_C2:%.*]] = icmp slt i32 [[COUNT3]], 50 +; CHECK-NEXT: [[CMP2_C2:%.*]] = icmp sgt i32 [[COUNT3]], 100 +; CHECK-NEXT: br i1 [[CMP2_C2]], label [[FOR_INC_JT3]], label [[SI_UNFOLD_FALSE:%.*]] +; CHECK: si.unfold.false: +; CHECK-NEXT: br i1 [[CMP_C2]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE1:%.*]] +; CHECK: si.unfold.false1: +; CHECK-NEXT: br label [[FOR_INC_JT2]] +; CHECK: si.unfold.true: +; CHECK-NEXT: br i1 [[CMP_C1]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE2:%.*]] +; CHECK: si.unfold.false2: +; CHECK-NEXT: br label [[FOR_INC_JT2]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 undef, 1 +; CHECK-NEXT: [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]] +; CHECK-NEXT: br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.inc.jt3: +; CHECK-NEXT: [[COUNT6:%.*]] = phi i32 [ [[COUNT3]], [[CASE2]] ], [ [[COUNT5]], [[CASE1]] ] +; CHECK-NEXT: [[STATE_NEXT_JT3]] = phi i32 [ 3, [[CASE1]] ], [ 3, [[CASE2]] ] +; CHECK-NEXT: [[INC_JT3]] = add nsw i32 [[COUNT6]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT3:%.*]] = icmp slt i32 [[INC_JT3]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT3]], label [[FOR_BODY_JT3:%.*]], label [[FOR_END]] +; CHECK: for.inc.jt2: +; CHECK-NEXT: [[COUNT7:%.*]] = phi i32 [ [[COUNT3]], [[SI_UNFOLD_FALSE1]] ], [ [[COUNT5]], [[SI_UNFOLD_FALSE2]] ] +; CHECK-NEXT: [[STATE_NEXT_JT2]] = phi i32 [ 2, [[SI_UNFOLD_FALSE1]] ], [ 2, [[SI_UNFOLD_FALSE2]] ] +; CHECK-NEXT: [[INC_JT2]] = add nsw i32 [[COUNT7]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]] +; CHECK: for.inc.jt1: +; CHECK-NEXT: [[COUNT4:%.*]] = phi i32 [ [[COUNT_JT3]], [[FOR_BODY_JT3]] ], [ [[COUNT3]], [[SI_UNFOLD_FALSE]] ], [ [[COUNT5]], [[SI_UNFOLD_TRUE]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[STATE_NEXT_JT1]] = phi i32 [ 1, [[FOR_BODY]] ], [ 1, [[SI_UNFOLD_FALSE]] ], [ 1, [[SI_UNFOLD_TRUE]] ], [ 1, [[FOR_BODY_JT3]] ] +; CHECK-NEXT: [[INC_JT1]] = add nsw i32 [[COUNT4]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + %cmp.c1 = icmp slt i32 %count, 50 + %cmp2.c1 = icmp slt i32 %count, 100 + %state1.1 = select i1 %cmp.c1, i32 1, i32 2 + %state1.2 = select i1 %cmp2.c1, i32 %state1.1, i32 3 + br label %for.inc + +case2: + %cmp.c2 = icmp slt i32 %count, 50 + %cmp2.c2 = icmp sgt i32 %count, 100 + %state2.1 = select i1 %cmp.c2, i32 1, i32 2 + %state2.2 = select i1 %cmp2.c2, i32 3, i32 %state2.1 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %state1.2, %case1 ], [ %state2.2, %case2 ], [ 1, %for.body ] + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +} + +define i32 @test3(i32 %num) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ undef, [[FOR_INC]] ] +; CHECK-NEXT: switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [ +; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 2, label [[CASE2:%.*]] +; CHECK-NEXT: ] +; CHECK: for.body.jt4: +; CHECK-NEXT: [[COUNT_JT4:%.*]] = phi i32 [ [[INC_JT4:%.*]], [[FOR_INC_JT4:%.*]] ] +; CHECK-NEXT: [[STATE_JT4:%.*]] = phi i32 [ [[STATE_NEXT_JT4:%.*]], [[FOR_INC_JT4]] ] +; CHECK-NEXT: br label [[FOR_INC_JT1]] +; CHECK: for.body.jt3: +; CHECK-NEXT: [[COUNT_JT3:%.*]] = phi i32 [ [[INC_JT3:%.*]], [[FOR_INC_JT3:%.*]] ] +; CHECK-NEXT: [[STATE_JT3:%.*]] = phi i32 [ [[STATE_NEXT_JT3:%.*]], [[FOR_INC_JT3]] ] +; CHECK-NEXT: br label [[FOR_INC_JT1]] +; CHECK: for.body.jt2: +; CHECK-NEXT: [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ] +; CHECK-NEXT: [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ] +; CHECK-NEXT: br label [[CASE2]] +; CHECK: for.body.jt1: +; CHECK-NEXT: [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: br label [[CASE1]] +; CHECK: case1: +; CHECK-NEXT: [[COUNT5:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: br label [[FOR_INC_JT2]] +; CHECK: case2: +; CHECK-NEXT: [[COUNT4:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP_1:%.*]] = icmp slt i32 [[COUNT4]], 50 +; CHECK-NEXT: [[CMP_2:%.*]] = icmp slt i32 [[COUNT4]], 100 +; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[COUNT4]], 1 +; CHECK-NEXT: [[CMP_3:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[CMP_3]], label [[SI_UNFOLD_TRUE:%.*]], label [[SI_UNFOLD_FALSE:%.*]] +; CHECK: si.unfold.true: +; CHECK-NEXT: br i1 [[CMP_1]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE2:%.*]] +; CHECK: si.unfold.false: +; CHECK-NEXT: br i1 [[CMP_2]], label [[FOR_INC_JT3]], label [[SI_UNFOLD_FALSE1:%.*]] +; CHECK: si.unfold.false1: +; CHECK-NEXT: br label [[FOR_INC_JT4]] +; CHECK: si.unfold.false2: +; CHECK-NEXT: br label [[FOR_INC_JT2]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 undef, 1 +; CHECK-NEXT: [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]] +; CHECK-NEXT: br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.inc.jt4: +; CHECK-NEXT: [[STATE_NEXT_JT4]] = phi i32 [ 4, [[SI_UNFOLD_FALSE1]] ] +; CHECK-NEXT: [[INC_JT4]] = add nsw i32 [[COUNT4]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT4:%.*]] = icmp slt i32 [[INC_JT4]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT4]], label [[FOR_BODY_JT4:%.*]], label [[FOR_END]] +; CHECK: for.inc.jt3: +; CHECK-NEXT: [[STATE_NEXT_JT3]] = phi i32 [ 3, [[SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[INC_JT3]] = add nsw i32 [[COUNT4]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT3:%.*]] = icmp slt i32 [[INC_JT3]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT3]], label [[FOR_BODY_JT3:%.*]], label [[FOR_END]] +; CHECK: for.inc.jt2: +; CHECK-NEXT: [[COUNT6:%.*]] = phi i32 [ [[COUNT4]], [[SI_UNFOLD_FALSE2]] ], [ [[COUNT5]], [[CASE1]] ] +; CHECK-NEXT: [[STATE_NEXT_JT2]] = phi i32 [ 2, [[CASE1]] ], [ 2, [[SI_UNFOLD_FALSE2]] ] +; CHECK-NEXT: [[INC_JT2]] = add nsw i32 [[COUNT6]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]] +; CHECK: for.inc.jt1: +; CHECK-NEXT: [[COUNT3:%.*]] = phi i32 [ [[COUNT_JT4]], [[FOR_BODY_JT4]] ], [ [[COUNT_JT3]], [[FOR_BODY_JT3]] ], [ [[COUNT4]], [[SI_UNFOLD_TRUE]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[STATE_NEXT_JT1]] = phi i32 [ 1, [[FOR_BODY]] ], [ 1, [[SI_UNFOLD_TRUE]] ], [ 1, [[FOR_BODY_JT3]] ], [ 1, [[FOR_BODY_JT4]] ] +; CHECK-NEXT: [[INC_JT1]] = add nsw i32 [[COUNT3]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %for.inc + +case2: + %cmp.1 = icmp slt i32 %count, 50 + %cmp.2 = icmp slt i32 %count, 100 + %0 = and i32 %count, 1 + %cmp.3 = icmp eq i32 %0, 0 + %sel.1 = select i1 %cmp.1, i32 1, i32 2 + %sel.2 = select i1 %cmp.2, i32 3, i32 4 + %sel.3 = select i1 %cmp.3, i32 %sel.1, i32 %sel.2 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %sel.3, %case2 ], [ 1, %for.body ], [ 2, %case1 ] + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +} diff --git a/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll b/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -dfa-jump-threading -dfa-max-path-length=6 %s | FileCheck %s + +; Make the path +; <%for.body %case1 %case1.1 %case1.2 %case1.3 %case1.4 %for.inc %for.end> +; too long so that it is not jump-threaded. +define i32 @max_path_length(i32 %num) { +; CHECK-LABEL: @max_path_length( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[STATE_NEXT:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [ +; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 2, label [[CASE2:%.*]] +; CHECK-NEXT: ] +; CHECK: for.body.jt2: +; CHECK-NEXT: [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ] +; CHECK-NEXT: [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ] +; CHECK-NEXT: br label [[CASE2]] +; CHECK: for.body.jt1: +; CHECK-NEXT: [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ] +; CHECK-NEXT: br label [[CASE1]] +; CHECK: case1: +; CHECK-NEXT: [[COUNT2:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: br label [[CASE1_1:%.*]] +; CHECK: case1.1: +; CHECK-NEXT: br label [[CASE1_2:%.*]] +; CHECK: case1.2: +; CHECK-NEXT: br label [[CASE1_3:%.*]] +; CHECK: case1.3: +; CHECK-NEXT: br label [[CASE1_4:%.*]] +; CHECK: case1.4: +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: case2: +; CHECK-NEXT: [[COUNT1:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[COUNT1]], 50 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE:%.*]] +; CHECK: si.unfold.false: +; CHECK-NEXT: br label [[FOR_INC_JT2]] +; CHECK: for.inc: +; CHECK-NEXT: [[STATE_NEXT]] = phi i32 [ 2, [[CASE1_4]] ] +; CHECK-NEXT: [[INC]] = add nsw i32 [[COUNT2]], 1 +; CHECK-NEXT: [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]] +; CHECK-NEXT: br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.inc.jt2: +; CHECK-NEXT: [[STATE_NEXT_JT2]] = phi i32 [ 2, [[SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[INC_JT2]] = add nsw i32 [[COUNT1]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]] +; CHECK: for.inc.jt1: +; CHECK-NEXT: [[COUNT3:%.*]] = phi i32 [ [[COUNT1]], [[CASE2]] ], [ [[COUNT]], [[FOR_BODY]] ] +; CHECK-NEXT: [[STATE_NEXT_JT1]] = phi i32 [ 1, [[CASE2]] ], [ 1, [[FOR_BODY]] ] +; CHECK-NEXT: [[INC_JT1]] = add nsw i32 [[COUNT3]], 1 +; CHECK-NEXT: [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]] +; CHECK-NEXT: br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %case1.1 + +case1.1: + br label %case1.2 + +case1.2: + br label %case1.3 + +case1.3: + br label %case1.4 + +case1.4: + br label %for.inc + +case2: + %cmp = icmp eq i32 %count, 50 + %sel = select i1 %cmp, i32 1, i32 2 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1.4 ] + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +} diff --git a/llvm/test/Transforms/DFAJumpThreading/negative.ll b/llvm/test/Transforms/DFAJumpThreading/negative.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DFAJumpThreading/negative.ll @@ -0,0 +1,216 @@ +; RUN: opt -dfa-jump-threading -dfa-cost-threshold=25 -pass-remarks-missed='dfa-jump-threading' -pass-remarks-output=%t -disable-output %s +; RUN: FileCheck --input-file %t --check-prefix=REMARK %s +; RUN: opt -S -dfa-jump-threading %s | FileCheck %s + +; This negative test case checks that the optimization doesn't trigger +; when the code size cost is too high. +define i32 @negative1(i32 %num) { +; REMARK: NotProfitable +; REMARK-NEXT: negative1 +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %for.inc + +case2: + %cmp = icmp eq i32 %count, 50 + %sel = select i1 %cmp, i32 1, i32 2 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ] + %add1 = add i32 %num, %num + %add2 = add i32 %add1, %add1 + %add3 = add i32 %add2, %add2 + %add4 = add i32 %add3, %add3 + %add5 = add i32 %add4, %add4 + %add6 = add i32 %add5, %add5 + %add7 = add i32 %add6, %add6 + %add8 = add i32 %add7, %add7 + %add9 = add i32 %add8, %add8 + %add10 = add i32 %add9, %add9 + %add11 = add i32 %add10, %add10 + %add12 = add i32 %add11, %add11 + %add13 = add i32 %add12, %add12 + %add14 = add i32 %add13, %add13 + %add15 = add i32 %add14, %add14 + %add16 = add i32 %add15, %add15 + %add17 = add i32 %add16, %add16 + %add18 = add i32 %add17, %add17 + %add19 = add i32 %add18, %add18 + %add20 = add i32 %add19, %add19 + %add21 = add i32 %add20, %add20 + %add22 = add i32 %add21, %add21 + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 %add22 +} + +declare void @func() + +define i32 @negative2(i32 %num) { +; REMARK: NonDuplicatableInst +; REMARK-NEXT: negative2 +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %for.inc + +case2: + %cmp = icmp eq i32 %count, 50 + %sel = select i1 %cmp, i32 1, i32 2 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ] + call void @func() noduplicate + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +} + +define i32 @negative3(i32 %num) { +; REMARK: ConvergentInst +; REMARK-NEXT: negative3 +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %for.inc + +case2: + %cmp = icmp eq i32 %count, 50 + %sel = select i1 %cmp, i32 1, i32 2 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ] + call void @func() convergent + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +} + +define i32 @negative4(i32 %num) { +; REMARK: SwitchNotPredictable +; REMARK-NEXT: negative4 +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %for.inc + +case2: + %cmp = icmp eq i32 %count, 50 + %sel = select i1 %cmp, i32 1, i32 2 + br label %for.inc + +for.inc: + ; the switch variable is not predictable since the exit value for %case1 + ; is defined through a non-instruction (function argument). + %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ %num, %case1 ] + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +} + +; Do not optimize if marked minsize. +define i32 @negative5(i32 %num) minsize { +; CHECK-LABEL: @negative5( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[STATE_NEXT:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: switch i32 [[STATE]], label [[FOR_INC]] [ +; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 2, label [[CASE2:%.*]] +; CHECK-NEXT: ] +; CHECK: case1: +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: case2: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[COUNT]], 50 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 1, i32 2 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[STATE_NEXT]] = phi i32 [ [[SEL]], [[CASE2]] ], [ 1, [[FOR_BODY]] ], [ 2, [[CASE1]] ] +; CHECK-NEXT: [[INC]] = add nsw i32 [[COUNT]], 1 +; CHECK-NEXT: [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]] +; CHECK-NEXT: br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: + %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ] + switch i32 %state, label %for.inc [ + i32 1, label %case1 + i32 2, label %case2 + ] + +case1: + br label %for.inc + +case2: + %cmp = icmp eq i32 %count, 50 + %sel = select i1 %cmp, i32 1, i32 2 + br label %for.inc + +for.inc: + %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ] + %inc = add nsw i32 %count, 1 + %cmp.exit = icmp slt i32 %inc, %num + br i1 %cmp.exit, label %for.body, label %for.end + +for.end: + ret i32 0 +}