Index: llvm/include/llvm/InitializePasses.h
===================================================================
--- llvm/include/llvm/InitializePasses.h
+++ llvm/include/llvm/InitializePasses.h
@@ -124,6 +124,7 @@
 void initializeDAEPass(PassRegistry&);
 void initializeDAHPass(PassRegistry&);
 void initializeDCELegacyPassPass(PassRegistry&);
+void initializeDFAJumpThreadingLegacyPassPass(PassRegistry &);
 void initializeDSELegacyPassPass(PassRegistry&);
 void initializeDataFlowSanitizerLegacyPassPass(PassRegistry &);
 void initializeDeadMachineInstructionElimPass(PassRegistry&);
Index: llvm/include/llvm/LinkAllPasses.h
===================================================================
--- llvm/include/llvm/LinkAllPasses.h
+++ llvm/include/llvm/LinkAllPasses.h
@@ -174,6 +174,7 @@
       (void) llvm::createStripDeadPrototypesPass();
       (void) llvm::createTailCallEliminationPass();
       (void) llvm::createJumpThreadingPass();
+      (void) llvm::createDFAJumpThreadingPass();
       (void) llvm::createUnifyFunctionExitNodesPass();
       (void) llvm::createInstCountPass();
       (void) llvm::createConstantHoistingPass();
Index: llvm/include/llvm/Transforms/Scalar.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar.h
+++ llvm/include/llvm/Transforms/Scalar.h
@@ -253,6 +253,14 @@
 FunctionPass *createJumpThreadingPass(bool FreezeSelectCond = false,
                                       int Threshold = -1);
 
+//===----------------------------------------------------------------------===//
+//
+// DFAJumpThreading - When a switch statement inside a loop is used to
+// implement a deterministic finite automata we can jump thread the switch
+// statement reducing number of conditional jumps.
+//
+FunctionPass *createDFAJumpThreadingPass();
+
 //===----------------------------------------------------------------------===//
 //
 // CFGSimplification - Merge basic blocks, eliminate unreachable blocks,
Index: llvm/include/llvm/Transforms/Scalar/DFAJumpThreading.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/Transforms/Scalar/DFAJumpThreading.h
@@ -0,0 +1,27 @@
+//===- DFAJumpThreading.h - Threads a switch statement inside a loop ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface for the DFAJumpThreading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_DFAJUMPTHREADING_H
+#define LLVM_TRANSFORMS_SCALAR_DFAJUMPTHREADING_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+struct DFAJumpThreadingPass : PassInfoMixin<DFAJumpThreadingPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_DFAJUMPTHREADING_H
Index: llvm/lib/Passes/PassBuilder.cpp
===================================================================
--- llvm/lib/Passes/PassBuilder.cpp
+++ llvm/lib/Passes/PassBuilder.cpp
@@ -145,6 +145,7 @@
 #include "llvm/Transforms/Scalar/ConstraintElimination.h"
 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
 #include "llvm/Transforms/Scalar/DCE.h"
+#include "llvm/Transforms/Scalar/DFAJumpThreading.h"
 #include "llvm/Transforms/Scalar/DeadStoreElimination.h"
 #include "llvm/Transforms/Scalar/DivRemPairs.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
@@ -303,6 +304,7 @@
 extern cl::opt<bool> EnableLoopInterchange;
 extern cl::opt<bool> EnableUnrollAndJam;
 extern cl::opt<bool> EnableLoopFlatten;
+extern cl::opt<bool> EnableDFAJumpThreading;
 extern cl::opt<bool> RunNewGVN;
 extern cl::opt<bool> RunPartialInlining;
 extern cl::opt<bool> ExtraVectorizerPasses;
@@ -827,6 +829,9 @@
 
   // Re-consider control flow based optimizations after redundancy elimination,
   // redo DCE, etc.
+  if (EnableDFAJumpThreading && Level.getSizeLevel() == 0)
+    FPM.addPass(DFAJumpThreadingPass());
+
   FPM.addPass(JumpThreadingPass());
   FPM.addPass(CorrelatedValuePropagationPass());
 
Index: llvm/lib/Passes/PassRegistry.def
===================================================================
--- llvm/lib/Passes/PassRegistry.def
+++ llvm/lib/Passes/PassRegistry.def
@@ -212,6 +212,7 @@
 FUNCTION_PASS("coro-cleanup", CoroCleanupPass())
 FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass())
 FUNCTION_PASS("dce", DCEPass())
+FUNCTION_PASS("dfa-jump-threading", DFAJumpThreadingPass())
 FUNCTION_PASS("div-rem-pairs", DivRemPairsPass())
 FUNCTION_PASS("dse", DSEPass())
 FUNCTION_PASS("dot-cfg", CFGPrinterPass())
Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -99,6 +99,10 @@
                                 cl::Hidden,
                                 cl::desc("Enable the LoopFlatten Pass"));
 
+cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread",
+                                     cl::desc("Enable DFA jump threading."),
+                                     cl::init(false), cl::Hidden);
+
 static cl::opt<bool>
     EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
                             cl::desc("Enable preparation for ThinLTO."));
@@ -500,6 +504,9 @@
   MPM.add(createInstructionCombiningPass());
   addExtensionsToPM(EP_Peephole, MPM);
   if (OptLevel > 1) {
+    if (EnableDFAJumpThreading && SizeLevel == 0)
+      MPM.add(createDFAJumpThreadingPass());
+
     MPM.add(createJumpThreadingPass());         // Thread jumps
     MPM.add(createCorrelatedValuePropagationPass());
   }
Index: llvm/lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -9,6 +9,7 @@
   CorrelatedValuePropagation.cpp
   DCE.cpp
   DeadStoreElimination.cpp
+  DFAJumpThreading.cpp
   DivRemPairs.cpp
   EarlyCSE.cpp
   FlattenCFGPass.cpp
Index: llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -0,0 +1,1287 @@
+//===- DFAJumpThreading.cpp - Threads a switch statement inside a loop ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Transform each threading path to effectively jump thread the DFA. For
+// example, the CFG below could be transformed as follows, where the cloned
+// blocks unconditionally branch to the next correct case based on what is
+// identified in the analysis.
+//
+//          sw.bb                        sw.bb
+//        /   |   \                    /   |   \
+//   case1  case2  case3          case1  case2  case3
+//        \   |   /                 |      |      |
+//       determinator            det.2   det.3  det.1
+//        br sw.bb                /        |        \
+//                          sw.bb.2     sw.bb.3     sw.bb.1
+//                           br case2    br case3    br case1§
+//
+// Definitions and Terminology:
+//
+// * Threading path:
+//   a list of basic blocks, the exit state, and the block that determines
+//   the next state, for which the following notation will be used:
+//   < path of BBs that form a cycle > [ state, determinator ]
+//
+// * Predictable switch:
+//   The switch variable is always a known constant so that all conditional
+//   jumps based on switch variable can be converted to unconditional jump.
+//
+// * Determinator:
+//   The basic block that determines the next state of the DFA.
+//
+// Representing the optimization in C-like pseudocode: the code pattern on the
+// left could functionally be transformed to the right pattern if the switch
+// condition is predictable.
+//
+//  X = A                       goto A
+//  for (...)                   A:
+//    switch (X)                  ...
+//      case A                    goto B
+//        X = B                 B:
+//      case B                    ...
+//        X = C                   goto C
+//
+// The pass first checks that switch variable X is decided by the control flow
+// path taken in the loop; for example, in case B, the next value of X is
+// decided to be C. It then enumerates through all paths in the loop and labels
+// the basic blocks where the next state is decided.
+//
+// Using this information it creates new paths that unconditionally branch to
+// the next case. This involves cloning code, so it only gets triggered if the
+// amount of code duplicated is below a threshold.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DFAJumpThreading.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <deque>
+#include <unordered_map>
+#include <unordered_set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dfa-jump-threading"
+
+STATISTIC(NumTransforms, "Number of transformations done");
+STATISTIC(NumCloned, "Number of blocks cloned");
+STATISTIC(NumPaths, "Number of individual paths threaded");
+
+static cl::opt<bool>
+    ClViewCfgBefore("dfa-jump-view-cfg-before",
+                    cl::desc("View the CFG before DFA Jump Threading"),
+                    cl::Hidden, cl::init(false));
+
+static cl::opt<unsigned> MaxPathLength(
+    "dfa-max-path-length",
+    cl::desc("Max number of blocks searched to find a threading path"),
+    cl::Hidden, cl::init(20));
+
+static cl::opt<unsigned>
+    CostThreshold("dfa-cost-threshold",
+                  cl::desc("Maximum cost accepted for the transformation"),
+                  cl::Hidden, cl::init(50));
+
+namespace {
+
+class SelectInstToUnfold {
+  SelectInst *SI;
+  PHINode *SIUse;
+
+public:
+  SelectInstToUnfold(SelectInst *SI, PHINode *SIUse) : SI(SI), SIUse(SIUse) {}
+
+  SelectInst *getInst() { return SI; }
+  PHINode *getUse() { return SIUse; }
+
+  explicit operator bool() const { return SI && SIUse; }
+};
+
+void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
+            std::vector<SelectInstToUnfold> *NewSIsToUnfold,
+            std::vector<BasicBlock *> *NewBBs);
+
+class DFAJumpThreading {
+public:
+  DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT,
+                   TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE)
+      : AC(AC), DT(DT), TTI(TTI), ORE(ORE) {}
+
+  bool run(Function &F);
+
+private:
+  void
+  unfoldSelectInstrs(DominatorTree *DT,
+                     const SmallVector<SelectInstToUnfold, 4> &SelectInsts) {
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+    SmallVector<SelectInstToUnfold, 4> Stack;
+    for (SelectInstToUnfold SIToUnfold : SelectInsts)
+      Stack.push_back(SIToUnfold);
+
+    while (!Stack.empty()) {
+      SelectInstToUnfold SIToUnfold = Stack.back();
+      Stack.pop_back();
+
+      std::vector<SelectInstToUnfold> NewSIsToUnfold;
+      std::vector<BasicBlock *> NewBBs;
+      unfold(&DTU, SIToUnfold, &NewSIsToUnfold, &NewBBs);
+
+      // Put newly discovered select instructions into the work list.
+      for (const SelectInstToUnfold &NewSIToUnfold : NewSIsToUnfold)
+        Stack.push_back(NewSIToUnfold);
+    }
+  }
+
+  AssumptionCache *AC;
+  DominatorTree *DT;
+  TargetTransformInfo *TTI;
+  OptimizationRemarkEmitter *ORE;
+};
+
+class DFAJumpThreadingLegacyPass : public FunctionPass {
+public:
+  static char ID; // Pass identification
+  DFAJumpThreadingLegacyPass() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    AssumptionCache *AC =
+        &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    OptimizationRemarkEmitter *ORE =
+        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+    return DFAJumpThreading(AC, DT, TTI, ORE).run(F);
+  }
+};
+} // end anonymous namespace
+
+char DFAJumpThreadingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(DFAJumpThreadingLegacyPass, "dfa-jump-threading",
+                      "DFA Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(DFAJumpThreadingLegacyPass, "dfa-jump-threading",
+                    "DFA Jump Threading", false, false)
+
+// Public interface to the DFA Jump Threading pass
+FunctionPass *llvm::createDFAJumpThreadingPass() {
+  return new DFAJumpThreadingLegacyPass();
+}
+
+namespace {
+
+/// Create a new basic block and sink \p SIToSink into it.
+void createBasicBlockAndSinkSelectInst(
+    DomTreeUpdater *DTU, SelectInst *SI, PHINode *SIUse, SelectInst *SIToSink,
+    BasicBlock *EndBlock, StringRef NewBBName, BasicBlock **NewBlock,
+    BranchInst **NewBranch, std::vector<SelectInstToUnfold> *NewSIsToUnfold,
+    std::vector<BasicBlock *> *NewBBs) {
+  assert(SIToSink->hasOneUse());
+  assert(NewBlock);
+  assert(NewBranch);
+  *NewBlock = BasicBlock::Create(SI->getContext(), NewBBName,
+                                 EndBlock->getParent(), EndBlock);
+  NewBBs->push_back(*NewBlock);
+  *NewBranch = BranchInst::Create(EndBlock, *NewBlock);
+  SIToSink->moveBefore(*NewBranch);
+  NewSIsToUnfold->push_back(SelectInstToUnfold(SIToSink, SIUse));
+  DTU->applyUpdates({{DominatorTree::Insert, *NewBlock, EndBlock}});
+}
+
+/// Unfold the select instruction held in \p SIToUnfold by replacing it with
+/// control flow.
+///
+/// Put newly discovered select instructions into \p NewSIsToUnfold. Put newly
+/// created basic blocks into \p NewBBs.
+///
+/// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible.
+void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
+            std::vector<SelectInstToUnfold> *NewSIsToUnfold,
+            std::vector<BasicBlock *> *NewBBs) {
+  SelectInst *SI = SIToUnfold.getInst();
+  PHINode *SIUse = SIToUnfold.getUse();
+  BasicBlock *StartBlock = SI->getParent();
+  BasicBlock *EndBlock = SIUse->getParent();
+  BranchInst *StartBlockTerm =
+      dyn_cast<BranchInst>(StartBlock->getTerminator());
+
+  assert(StartBlockTerm && StartBlockTerm->isUnconditional());
+  assert(SI->hasOneUse());
+
+  // These are the new basic blocks for the conditional branch.
+  // At least one will become an actual new basic block.
+  BasicBlock *TrueBlock = nullptr;
+  BasicBlock *FalseBlock = nullptr;
+  BranchInst *TrueBranch = nullptr;
+  BranchInst *FalseBranch = nullptr;
+
+  // Sink select instructions to be able to unfold them later.
+  if (SelectInst *SIOp = dyn_cast<SelectInst>(SI->getTrueValue())) {
+    createBasicBlockAndSinkSelectInst(DTU, SI, SIUse, SIOp, EndBlock,
+                                      "si.unfold.true", &TrueBlock, &TrueBranch,
+                                      NewSIsToUnfold, NewBBs);
+  }
+  if (SelectInst *SIOp = dyn_cast<SelectInst>(SI->getFalseValue())) {
+    createBasicBlockAndSinkSelectInst(DTU, SI, SIUse, SIOp, EndBlock,
+                                      "si.unfold.false", &FalseBlock,
+                                      &FalseBranch, NewSIsToUnfold, NewBBs);
+  }
+
+  // If there was nothing to sink, then arbitrarily choose the 'false' side
+  // for a new input value to the PHI.
+  if (!TrueBlock && !FalseBlock) {
+    FalseBlock = BasicBlock::Create(SI->getContext(), "si.unfold.false",
+                                    EndBlock->getParent(), EndBlock);
+    NewBBs->push_back(FalseBlock);
+    BranchInst::Create(EndBlock, FalseBlock);
+    DTU->applyUpdates({{DominatorTree::Insert, FalseBlock, EndBlock}});
+  }
+
+  // Insert the real conditional branch based on the original condition.
+  // If we did not create a new block for one of the 'true' or 'false' paths
+  // of the condition, it means that side of the branch goes to the end block
+  // directly and the path originates from the start block from the point of
+  // view of the new PHI.
+  BasicBlock *TT = EndBlock;
+  BasicBlock *FT = EndBlock;
+  if (TrueBlock && FalseBlock) {
+    // A diamond.
+    TT = TrueBlock;
+    FT = FalseBlock;
+
+    // Update the phi node of SI.
+    SIUse->removeIncomingValue(StartBlock, /* DeletePHIIfEmpty = */ false);
+    SIUse->addIncoming(SI->getTrueValue(), TrueBlock);
+    SIUse->addIncoming(SI->getFalseValue(), FalseBlock);
+
+    // Update any other PHI nodes in EndBlock.
+    for (PHINode &Phi : EndBlock->phis()) {
+      if (&Phi != SIUse) {
+        Phi.addIncoming(Phi.getIncomingValueForBlock(StartBlock), TrueBlock);
+        Phi.addIncoming(Phi.getIncomingValueForBlock(StartBlock), FalseBlock);
+      }
+    }
+  } else {
+    BasicBlock *NewBlock = nullptr;
+    Value *SIOp1 = SI->getTrueValue();
+    Value *SIOp2 = SI->getFalseValue();
+
+    // A triangle pointing right.
+    if (!TrueBlock) {
+      NewBlock = FalseBlock;
+      FT = FalseBlock;
+    }
+    // A triangle pointing left.
+    else {
+      NewBlock = TrueBlock;
+      TT = TrueBlock;
+      std::swap(SIOp1, SIOp2);
+    }
+
+    // Update the phi node of SI.
+    for (unsigned Idx = 0; Idx < SIUse->getNumIncomingValues(); ++Idx) {
+      if (SIUse->getIncomingBlock(Idx) == StartBlock)
+        SIUse->setIncomingValue(Idx, SIOp1);
+    }
+    SIUse->addIncoming(SIOp2, NewBlock);
+
+    // Update any other PHI nodes in EndBlock.
+    for (auto II = EndBlock->begin(); PHINode *Phi = dyn_cast<PHINode>(II);
+         ++II) {
+      if (Phi != SIUse)
+        Phi->addIncoming(Phi->getIncomingValueForBlock(StartBlock), NewBlock);
+    }
+  }
+  StartBlockTerm->eraseFromParent();
+  BranchInst::Create(TT, FT, SI->getCondition(), StartBlock);
+  DTU->applyUpdates({{DominatorTree::Insert, StartBlock, TT},
+                     {DominatorTree::Insert, StartBlock, FT}});
+
+  // The select is now dead.
+  SI->eraseFromParent();
+}
+
+struct ClonedBlock {
+  BasicBlock *BB;
+  uint64_t State; ///< \p State corresponds to the next value of a switch stmnt.
+};
+
+typedef std::deque<BasicBlock *> PathType;
+typedef std::vector<PathType> PathsType;
+typedef std::set<const BasicBlock *> VisitedBlocks;
+typedef std::vector<ClonedBlock> CloneList;
+
+// This data structure keeps track of all blocks that have been cloned.  If two
+// different ThreadingPaths clone the same block for a certain state it should
+// be reused, and it can be looked up in this map.
+typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap;
+
+// This map keeps track of all the new definitions for an instruction. This
+// information is needed when restoring SSA form after cloning blocks.
+typedef DenseMap<Instruction *, std::vector<Instruction *>> DefMap;
+
+inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) {
+  OS << "< ";
+  for (const BasicBlock *BB : Path) {
+    std::string BBName;
+    if (BB->hasName())
+      raw_string_ostream(BBName) << BB->getName();
+    else
+      raw_string_ostream(BBName) << BB;
+    OS << BBName << " ";
+  }
+  OS << ">";
+  return OS;
+}
+
+/// ThreadingPath is a path in the control flow of a loop that can be threaded
+/// by cloning necessary basic blocks and replacing conditional branches with
+/// unconditional ones. A threading path includes a list of basic blocks, the
+/// exit state, and the block that determines the next state.
+struct ThreadingPath {
+  /// Exit value is DFA's exit state for the given path.
+  uint64_t getExitValue() const { return ExitVal; }
+  void setExitValue(const ConstantInt *V) {
+    ExitVal = V->getZExtValue();
+    IsExitValSet = true;
+  }
+  bool isExitValueSet() const { return IsExitValSet; }
+
+  /// Determinator is the basic block that determines the next state of the DFA.
+  const BasicBlock *getDeterminatorBB() const { return DBB; }
+  void setDeterminator(const BasicBlock *BB) { DBB = BB; }
+
+  /// Path is a list of basic blocks.
+  const PathType &getPath() const { return Path; }
+  void setPath(const PathType &NewPath) { Path = NewPath; }
+
+  void print(raw_ostream &OS) const {
+    OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]";
+  }
+
+private:
+  PathType Path;
+  uint64_t ExitVal;
+  const BasicBlock *DBB = nullptr;
+  bool IsExitValSet = false;
+};
+
+#ifndef NDEBUG
+inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) {
+  TPath.print(OS);
+  return OS;
+}
+#endif
+
+struct MainSwitch {
+  MainSwitch(SwitchInst *SI, OptimizationRemarkEmitter *ORE) {
+    if (isPredictable(SI)) {
+      Instr = SI;
+    } else {
+      ORE->emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable", SI)
+               << "Switch instruction is not predictable.";
+      });
+    }
+  }
+
+  virtual ~MainSwitch() = default;
+
+  SwitchInst *getInstr() const { return Instr; }
+  const SmallVector<SelectInstToUnfold, 4> getSelectInsts() {
+    return SelectInsts;
+  }
+
+private:
+  /// Do a use-def chain traversal. Make sure the value of the switch variable
+  /// is always a known constant. This means that all conditional jumps based on
+  /// switch variable can be converted to unconditional jumps.
+  bool isPredictable(const SwitchInst *SI) {
+    std::deque<Instruction *> Q;
+    SmallSet<Value *, 16> SeenValues;
+    SelectInsts.clear();
+
+    Value *FirstDef = SI->getOperand(0);
+    auto *Inst = dyn_cast<Instruction>(FirstDef);
+
+    // If this is a function argument or another non-instruction, then give up.
+    // We are interested in loop local variables.
+    if (!Inst)
+      return false;
+
+    // Require the first definition to be a PHINode
+    if (!isa<PHINode>(Inst))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "\tisPredictable() FirstDef: " << *Inst << "\n");
+
+    Q.push_back(Inst);
+    SeenValues.insert(FirstDef);
+
+    while (!Q.empty()) {
+      Instruction *Current = Q.front();
+      Q.pop_front();
+
+      if (auto *Phi = dyn_cast<PHINode>(Current)) {
+        for (Value *Incoming : Phi->incoming_values()) {
+          if (!isPredictableValue(Incoming, SeenValues))
+            return false;
+          addInstToQueue(Incoming, Q, SeenValues);
+        }
+        LLVM_DEBUG(dbgs() << "\tisPredictable() phi: " << *Phi << "\n");
+      } else if (SelectInst *SelI = dyn_cast<SelectInst>(Current)) {
+        if (!isValidSelectInst(SelI))
+          return false;
+        if (!isPredictableValue(SelI->getTrueValue(), SeenValues) ||
+            !isPredictableValue(SelI->getFalseValue(), SeenValues)) {
+          return false;
+        }
+        addInstToQueue(SelI->getTrueValue(), Q, SeenValues);
+        addInstToQueue(SelI->getFalseValue(), Q, SeenValues);
+        LLVM_DEBUG(dbgs() << "\tisPredictable() select: " << *SelI << "\n");
+        if (auto *SelIUse = dyn_cast<PHINode>(SelI->user_back()))
+          SelectInsts.push_back(SelectInstToUnfold(SelI, SelIUse));
+      } else {
+        // If it is neither a phi nor a select, then we give up.
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  bool isPredictableValue(Value *InpVal, SmallSet<Value *, 16> &SeenValues) {
+    if (SeenValues.find(InpVal) != SeenValues.end())
+      return true;
+
+    if (isa<ConstantInt>(InpVal))
+      return true;
+
+    // If this is a function argument or another non-instruction, then give up.
+    if (!isa<Instruction>(InpVal))
+      return false;
+
+    return true;
+  }
+
+  void addInstToQueue(Value *Val, std::deque<Instruction *> &Q,
+                      SmallSet<Value *, 16> &SeenValues) {
+    if (SeenValues.find(Val) != SeenValues.end())
+      return;
+    if (Instruction *I = dyn_cast<Instruction>(Val))
+      Q.push_back(I);
+    SeenValues.insert(Val);
+  }
+
+  bool isValidSelectInst(SelectInst *SI) {
+    if (!SI->hasOneUse())
+      return false;
+
+    Instruction *SIUse = dyn_cast<Instruction>(SI->user_back());
+    // The use of the select inst should be either a phi or another select.
+    if (!SIUse && !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse)))
+      return false;
+
+    BasicBlock *SIBB = SI->getParent();
+
+    // Currently, we can only expand select instructions in basic blocks with
+    // one successor.
+    BranchInst *SITerm = dyn_cast<BranchInst>(SIBB->getTerminator());
+    if (!SITerm || !SITerm->isUnconditional())
+      return false;
+
+    if (isa<PHINode>(SIUse) &&
+        SIBB->getSingleSuccessor() != dyn_cast<Instruction>(SIUse)->getParent())
+      return false;
+
+    // If select will not be sunk during unfolding, and it is in the same basic
+    // block as another state defining select, then cannot unfold both.
+    for (SelectInstToUnfold SIToUnfold : SelectInsts) {
+      SelectInst *PrevSI = SIToUnfold.getInst();
+      if (PrevSI->getTrueValue() != SI && PrevSI->getFalseValue() != SI &&
+          PrevSI->getParent() == SI->getParent())
+        return false;
+    }
+
+    return true;
+  }
+
+  SwitchInst *Instr = nullptr;
+  SmallVector<SelectInstToUnfold, 4> SelectInsts;
+};
+
+struct AllSwitchPaths {
+  AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE)
+      : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()),
+        ORE(ORE) {}
+
+  std::vector<ThreadingPath> &getThreadingPaths() { return TPaths; }
+  unsigned getNumThreadingPaths() { return TPaths.size(); }
+  SwitchInst *getSwitchInst() { return Switch; }
+  BasicBlock *getSwitchBlock() { return SwitchBlock; }
+
+  void run() {
+    VisitedBlocks Visited;
+    PathsType LoopPaths = paths(SwitchBlock, Visited, /* PathDepth = */ 1);
+    StateDefMap StateDef = getStateDefMap();
+
+    for (PathType Path : LoopPaths) {
+      ThreadingPath TPath;
+
+      const BasicBlock *PrevBB = Path.back();
+      for (const BasicBlock *BB : Path) {
+        if (StateDef.count(BB) != 0) {
+          const PHINode *Phi = dyn_cast<PHINode>(StateDef[BB]);
+          assert(Phi && "Expected a state-defining instr to be a phi node.");
+
+          const Value *V = Phi->getIncomingValueForBlock(PrevBB);
+          if (const ConstantInt *C = dyn_cast<const ConstantInt>(V)) {
+            TPath.setExitValue(C);
+            TPath.setDeterminator(BB);
+            TPath.setPath(Path);
+          }
+        }
+
+        // Switch block is the determinator, this is the final exit value.
+        if (TPath.isExitValueSet() && BB == Path.front())
+          break;
+
+        PrevBB = BB;
+      }
+
+      if (TPath.isExitValueSet())
+        TPaths.push_back(TPath);
+    }
+  }
+
+private:
+  // Value: an instruction that defines a switch state;
+  // Key: the parent basic block of that instruction.
+  typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap;
+
+  PathsType paths(BasicBlock *BB, VisitedBlocks &Visited,
+                  unsigned PathDepth) const {
+    PathsType Res;
+
+    // Stop exploring paths after visiting MaxPathLength blocks
+    if (PathDepth > MaxPathLength) {
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysis(DEBUG_TYPE, "MaxPathLengthReached",
+                                          Switch)
+               << "Exploration stopped after visiting MaxPathLength="
+               << ore::NV("MaxPathLength", MaxPathLength) << " blocks.";
+      });
+      return Res;
+    }
+
+    Visited.insert(BB);
+
+    // Some blocks have multiple edges to the same successor, and this set
+    // is used to prevent a duplicate path from being generated
+    SmallSet<BasicBlock *, 4> Successors;
+
+    for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
+      BasicBlock *Succ = *SI;
+
+      if (Successors.find(Succ) != Successors.end())
+        continue;
+      Successors.insert(Succ);
+
+      // Found a cycle through the SwitchBlock
+      if (Succ == SwitchBlock) {
+        Res.push_back({BB});
+        continue;
+      }
+
+      // We have encountered a cycle, do not get caught in it
+      if (Visited.find(Succ) != Visited.end())
+        continue;
+
+      PathsType SuccPaths = paths(Succ, Visited, PathDepth + 1);
+      for (PathType Path : SuccPaths) {
+        PathType NewPath(Path);
+        NewPath.push_front(BB);
+        Res.push_back(NewPath);
+      }
+    }
+    // This block could now be visited again from a different predecessor. Note
+    // that this will result in exponential runtime. Subpaths could possibly be
+    // cached but it takes a lot of memory to store them.
+    Visited.erase(BB);
+    return Res;
+  }
+
+  /// Walk the use-def chain and collect all the state-defining instructions.
+  StateDefMap getStateDefMap() const {
+    StateDefMap Res;
+
+    Value *FirstDef = Switch->getOperand(0);
+
+    assert(isa<PHINode>(FirstDef) && "After select unfolding, all state "
+                                     "definitions are expected to be phi "
+                                     "nodes.");
+
+    SmallVector<PHINode *, 8> Stack;
+    Stack.push_back(dyn_cast<PHINode>(FirstDef));
+    SmallSet<Value *, 16> SeenValues;
+
+    while (!Stack.empty()) {
+      PHINode *CurPhi = Stack.back();
+      Stack.pop_back();
+
+      Res[CurPhi->getParent()] = CurPhi;
+      SeenValues.insert(CurPhi);
+
+      for (Value *Incoming : CurPhi->incoming_values()) {
+        if (Incoming == FirstDef || isa<ConstantInt>(Incoming) ||
+            SeenValues.find(Incoming) != SeenValues.end()) {
+          continue;
+        }
+
+        assert(isa<PHINode>(Incoming) && "After select unfolding, all state "
+                                         "definitions are expected to be phi "
+                                         "nodes.");
+
+        Stack.push_back(cast<PHINode>(Incoming));
+      }
+    }
+
+    return Res;
+  }
+
+  SwitchInst *Switch;
+  BasicBlock *SwitchBlock;
+  OptimizationRemarkEmitter *ORE;
+  std::vector<ThreadingPath> TPaths;
+};
+
+struct TransformDFA {
+  TransformDFA(AllSwitchPaths *SwitchPaths, DominatorTree *DT,
+               AssumptionCache *AC, TargetTransformInfo *TTI,
+               OptimizationRemarkEmitter *ORE,
+               SmallPtrSet<const Value *, 32> EphValues)
+      : SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE),
+        EphValues(EphValues) {}
+
+  void run() {
+    if (isLegalAndProfitableToTransform()) {
+      createAllExitPaths();
+      NumTransforms++;
+    }
+  }
+
+private:
+  /// This function performs both a legality check and profitability check at
+  /// the same time since it is convenient to do so. It iterates through all
+  /// blocks that will be cloned, and keeps track of the duplication cost. It
+  /// also returns false if it is illegal to clone some required block.
+  bool isLegalAndProfitableToTransform() {
+    CodeMetrics Metrics;
+    SwitchInst *Switch = SwitchPaths->getSwitchInst();
+
+    // Note that DuplicateBlockMap is not being used as intended here. It is
+    // just being used to ensure (BB, State) pairs are only counted once.
+    DuplicateBlockMap DuplicateMap;
+
+    for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
+      PathType PathBBs = TPath.getPath();
+      uint64_t NextState = TPath.getExitValue();
+      const BasicBlock *Determinator = TPath.getDeterminatorBB();
+
+      // Update Metrics for the Switch block, this is always cloned
+      BasicBlock *BB = SwitchPaths->getSwitchBlock();
+      BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap);
+      if (!VisitedBB) {
+        Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+        DuplicateMap[BB].push_back({BB, NextState});
+      }
+
+      // If the Switch block is the Determinator, then we can continue since
+      // this is the only block that is cloned and we already counted for it.
+      if (PathBBs.front() == Determinator)
+        continue;
+
+      // Otherwise update Metrics for all blocks that will be cloned. If any
+      // block is already cloned and would be reused, don't double count it.
+      auto DetIt = std::find(PathBBs.begin(), PathBBs.end(), Determinator);
+      for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) {
+        BB = *BBIt;
+        VisitedBB = getClonedBB(BB, NextState, DuplicateMap);
+        if (VisitedBB)
+          continue;
+        Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+        DuplicateMap[BB].push_back({BB, NextState});
+      }
+
+      if (Metrics.notDuplicatable) {
+        LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains "
+                          << "non-duplicatable instructions.\n");
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "NonDuplicatableInst",
+                                          Switch)
+                 << "Contains non-duplicatable instructions.";
+        });
+        return false;
+      }
+
+      if (Metrics.convergent) {
+        LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains "
+                          << "convergent instructions.\n");
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst", Switch)
+                 << "Contains convergent instructions.";
+        });
+        return false;
+      }
+    }
+
+    unsigned DuplicationCost = 0;
+
+    unsigned JumpTableSize = 0;
+    TTI->getEstimatedNumberOfCaseClusters(
+        *Switch, JumpTableSize, nullptr, nullptr);
+    if (JumpTableSize == 0) {
+      // Factor in the number of conditional branches reduced from jump
+      // threading.  Assume that lowering the switch block is implemented by
+      // using binary search, hence the LogBase2().
+      unsigned CondBranches =
+          APInt(32, Switch->getNumSuccessors()).ceilLogBase2();
+      DuplicationCost = Metrics.NumInsts / CondBranches;
+    } else {
+      // Compared with jump tables, the DFA optimizer removes an indirect branch
+      // on each loop iteration, thus making branch prediction more precise. The
+      // more branch targets there are, the more likely it is for the branch
+      // predictor to make a mistake, and the more benefit there is in the DFA
+      // optimizer. Thus, the more branch targets there are, the lower is the
+      // cost of the DFA opt.
+      DuplicationCost = Metrics.NumInsts / JumpTableSize;
+    }
+
+    LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block "
+                      << SwitchPaths->getSwitchBlock()->getName()
+                      << " is: " << DuplicationCost << "\n\n");
+
+    if (DuplicationCost > CostThreshold) {
+      LLVM_DEBUG(dbgs() << "Not jump threading, duplication cost exceeds the "
+                        << "cost threshold.\n");
+      ORE->emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch)
+               << "Duplication cost exceeds the cost threshold (cost="
+               << ore::NV("Cost", DuplicationCost)
+               << ", threshold=" << ore::NV("Threshold", CostThreshold) << ").";
+      });
+      return false;
+    }
+
+    ORE->emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "JumpThreaded", Switch)
+             << "Switch statement jump-threaded.";
+    });
+
+    return true;
+  }
+
+  /// Transform each threading path to effectively jump thread the DFA.
+  void createAllExitPaths() {
+    DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Eager);
+
+    // Move the switch block to the end of the path, since it will be duplicated
+    BasicBlock *SwitchBlock = SwitchPaths->getSwitchBlock();
+    for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
+      LLVM_DEBUG(dbgs() << TPath << "\n");
+      PathType NewPath(TPath.getPath());
+      NewPath.push_back(SwitchBlock);
+      TPath.setPath(NewPath);
+    }
+
+    // Transform the ThreadingPaths and keep track of the cloned values
+    DuplicateBlockMap DuplicateMap;
+    DefMap NewDefs;
+
+    SmallSet<BasicBlock *, 16> BlocksToClean;
+    for (BasicBlock *BB : successors(SwitchBlock))
+      BlocksToClean.insert(BB);
+
+    for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
+      createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU);
+      NumPaths++;
+    }
+
+    // After all paths are cloned, now update the last successor of the cloned
+    // path so it skips over the switch statement
+    for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths())
+      updateLastSuccessor(TPath, DuplicateMap, &DTU);
+
+    // For each instruction that was cloned and used outside, update its uses
+    updateSSA(NewDefs);
+
+    // Clean PHI Nodes for the newly created blocks
+    for (BasicBlock *BB : BlocksToClean)
+      cleanPhiNodes(BB);
+  }
+
+  /// For a specific ThreadingPath \p Path, create an exit path starting from
+  /// the determinator block.
+  ///
+  /// To remember the correct destination, we have to duplicate blocks
+  /// corresponding to each state. Also update the terminating instruction of
+  /// the predecessors, and phis in the successor blocks.
+  void createExitPath(DefMap &NewDefs, ThreadingPath &Path,
+                      DuplicateBlockMap &DuplicateMap,
+                      SmallSet<BasicBlock *, 16> &BlocksToClean,
+                      DomTreeUpdater *DTU) {
+    uint64_t NextState = Path.getExitValue();
+    const BasicBlock *Determinator = Path.getDeterminatorBB();
+    PathType PathBBs = Path.getPath();
+
+    // Don't select the placeholder block in front
+    if (PathBBs.front() == Determinator)
+      PathBBs.pop_front();
+
+    auto DetIt = std::find(PathBBs.begin(), PathBBs.end(), Determinator);
+    auto Prev = std::prev(DetIt);
+    BasicBlock *PrevBB = *Prev;
+    for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) {
+      BasicBlock *BB = *BBIt;
+      BlocksToClean.insert(BB);
+
+      // We already cloned BB for this NextState, now just update the branch
+      // and continue.
+      BasicBlock *NextBB = getClonedBB(BB, NextState, DuplicateMap);
+      if (NextBB) {
+        updatePredecessor(PrevBB, BB, NextBB, DTU);
+        PrevBB = NextBB;
+        continue;
+      }
+
+      // Clone the BB and update the successor of Prev to jump to the new block
+      BasicBlock *NewBB = cloneBlockAndUpdatePredecessor(
+          BB, PrevBB, NextState, DuplicateMap, NewDefs, DTU);
+      DuplicateMap[BB].push_back({NewBB, NextState});
+      BlocksToClean.insert(NewBB);
+      PrevBB = NewBB;
+    }
+  }
+
+  /// Restore SSA form after cloning blocks.
+  ///
+  /// Each cloned block creates new defs for a variable, and the uses need to be
+  /// updated to reflect this. The uses may be replaced with a cloned value, or
+  /// some derived phi instruction. Note that all uses of a value defined in the
+  /// same block were already remapped when cloning the block.
+  void updateSSA(DefMap &NewDefs) {
+    SSAUpdaterBulk SSAUpdate;
+    SmallVector<Use *, 16> UsesToRename;
+
+    for (auto KV : NewDefs) {
+      Instruction *I = KV.first;
+      BasicBlock *BB = I->getParent();
+      std::vector<Instruction *> Cloned = KV.second;
+
+      // Scan all uses of this instruction to see if it is used outside of its
+      // block, and if so, record them in UsesToRename.
+      for (Use &U : I->uses()) {
+        Instruction *User = cast<Instruction>(U.getUser());
+        if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+          if (UserPN->getIncomingBlock(U) == BB)
+            continue;
+        } else if (User->getParent() == BB) {
+          continue;
+        }
+
+        UsesToRename.push_back(&U);
+      }
+
+      // If there are no uses outside the block, we're done with this
+      // instruction.
+      if (UsesToRename.empty())
+        continue;
+      LLVM_DEBUG(dbgs() << "DFA-JT: Renaming non-local uses of: " << *I
+                        << "\n");
+
+      // We found a use of I outside of BB.  Rename all uses of I that are
+      // outside its block to be uses of the appropriate PHI node etc.  See
+      // ValuesInBlocks with the values we know.
+      unsigned VarNum = SSAUpdate.AddVariable(I->getName(), I->getType());
+      SSAUpdate.AddAvailableValue(VarNum, BB, I);
+      for (Instruction *New : Cloned)
+        SSAUpdate.AddAvailableValue(VarNum, New->getParent(), New);
+
+      while (!UsesToRename.empty())
+        SSAUpdate.AddUse(VarNum, UsesToRename.pop_back_val());
+
+      LLVM_DEBUG(dbgs() << "\n");
+    }
+    // SSAUpdater handles phi placement and renaming uses with the appropriate
+    // value.
+    SSAUpdate.RewriteAllUses(DT);
+  }
+
+  /// Clones a basic block, and adds it to the CFG.
+  ///
+  /// This function also includes updating phi nodes in the successors of the
+  /// BB, and remapping uses that were defined locally in the cloned BB.
+  BasicBlock *cloneBlockAndUpdatePredecessor(BasicBlock *BB, BasicBlock *PrevBB,
+                                             uint64_t NextState,
+                                             DuplicateBlockMap &DuplicateMap,
+                                             DefMap &NewDefs,
+                                             DomTreeUpdater *DTU) {
+    ValueToValueMapTy VMap;
+    BasicBlock *NewBB = CloneBasicBlock(
+        BB, VMap, ".jt" + std::to_string(NextState), BB->getParent());
+    NewBB->moveAfter(BB);
+    NumCloned++;
+
+    for (Instruction &I : *NewBB) {
+      // Do not remap operands of PHINode in case a definition in BB is an
+      // incoming value to a phi in the same block. This incoming value will
+      // be renamed later while restoring SSA.
+      if (isa<PHINode>(&I))
+        continue;
+      RemapInstruction(&I, VMap,
+                       RF_IgnoreMissingLocals | RF_NoModuleLevelChanges);
+      if (AssumeInst *II = dyn_cast<AssumeInst>(&I))
+        AC->registerAssumption(II);
+    }
+
+    updateSuccessorPhis(BB, NewBB, NextState, VMap, DuplicateMap);
+    updatePredecessor(PrevBB, BB, NewBB, DTU);
+    updateDefMap(NewDefs, VMap);
+
+    // Add all successors to the DominatorTree
+    SmallPtrSet<BasicBlock *, 4> SuccSet;
+    for (auto *SuccBB : successors(NewBB)) {
+      if (SuccSet.insert(SuccBB).second)
+        DTU->applyUpdates({{DominatorTree::Insert, NewBB, SuccBB}});
+    }
+    SuccSet.clear();
+    return NewBB;
+  }
+
+  /// Update the phi nodes in BB's successors.
+  ///
+  /// This means creating a new incoming value from NewBB with the new
+  /// instruction wherever there is an incoming value from BB.
+  void updateSuccessorPhis(BasicBlock *BB, BasicBlock *ClonedBB,
+                           uint64_t NextState, ValueToValueMapTy &VMap,
+                           DuplicateBlockMap &DuplicateMap) {
+    std::vector<BasicBlock *> BlocksToUpdate;
+
+    // If BB is the last block in the path, we can simply update the one case
+    // successor that will be reached.
+    if (BB == SwitchPaths->getSwitchBlock()) {
+      SwitchInst *Switch = SwitchPaths->getSwitchInst();
+      BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState);
+      BlocksToUpdate.push_back(NextCase);
+      BasicBlock *ClonedSucc = getClonedBB(NextCase, NextState, DuplicateMap);
+      if (ClonedSucc)
+        BlocksToUpdate.push_back(ClonedSucc);
+    }
+    // Otherwise update phis in all successors.
+    else {
+      for (BasicBlock *Succ : successors(BB)) {
+        BlocksToUpdate.push_back(Succ);
+
+        // Check if a successor has already been cloned for the particular exit
+        // value. In this case if a successor was already cloned, the phi nodes
+        // in the cloned block should be updated directly.
+        BasicBlock *ClonedSucc = getClonedBB(Succ, NextState, DuplicateMap);
+        if (ClonedSucc)
+          BlocksToUpdate.push_back(ClonedSucc);
+      }
+    }
+
+    // If there is a phi with an incoming value from BB, create a new incoming
+    // value for the new predecessor ClonedBB. The value will either be the same
+    // value from BB or a cloned value.
+    for (BasicBlock *Succ : BlocksToUpdate) {
+      for (auto II = Succ->begin(); PHINode *Phi = dyn_cast<PHINode>(II);
+           ++II) {
+        Value *Incoming = Phi->getIncomingValueForBlock(BB);
+        if (Incoming) {
+          if (isa<Constant>(Incoming)) {
+            Phi->addIncoming(Incoming, ClonedBB);
+            continue;
+          }
+          Value *ClonedVal = VMap[Incoming];
+          if (ClonedVal)
+            Phi->addIncoming(ClonedVal, ClonedBB);
+          else
+            Phi->addIncoming(Incoming, ClonedBB);
+        }
+      }
+    }
+  }
+
+  /// Sets the successor of PrevBB to be NewBB instead of OldBB. Note that all
+  /// other successors are kept as well.
+  void updatePredecessor(BasicBlock *PrevBB, BasicBlock *OldBB,
+                         BasicBlock *NewBB, DomTreeUpdater *DTU) {
+    // When a path is reused, there is a chance that predecessors were already
+    // updated before. Check if the predecessor needs to be updated first.
+    if (!isPredecessor(OldBB, PrevBB))
+      return;
+
+    Instruction *PrevTerm = PrevBB->getTerminator();
+    for (unsigned Idx = 0; Idx < PrevTerm->getNumSuccessors(); Idx++) {
+      if (PrevTerm->getSuccessor(Idx) == OldBB) {
+        OldBB->removePredecessor(PrevBB, /* KeepOneInputPHIs = */ true);
+        PrevTerm->setSuccessor(Idx, NewBB);
+      }
+    }
+    DTU->applyUpdates({{DominatorTree::Delete, PrevBB, OldBB},
+                       {DominatorTree::Insert, PrevBB, NewBB}});
+  }
+
+  /// Add new value mappings to the DefMap to keep track of all new definitions
+  /// for a particular instruction. These will be used while updating SSA form.
+  void updateDefMap(DefMap &NewDefs, ValueToValueMapTy &VMap) {
+    for (auto Entry : VMap) {
+      Instruction *Inst =
+          dyn_cast<Instruction>(const_cast<Value *>(Entry.first));
+      if (!Inst || !Entry.second || isa<BranchInst>(Inst) ||
+          isa<SwitchInst>(Inst)) {
+        continue;
+      }
+
+      Instruction *Cloned = dyn_cast<Instruction>(Entry.second);
+      if (!Cloned)
+        continue;
+
+      if (NewDefs.find(Inst) == NewDefs.end())
+        NewDefs[Inst] = {Cloned};
+      else
+        NewDefs[Inst].push_back(Cloned);
+    }
+  }
+
+  /// Update the last branch of a particular cloned path to point to the correct
+  /// case successor.
+  ///
+  /// Note that this is an optional step and would have been done in later
+  /// optimizations, but it makes the CFG significantly easier to work with.
+  void updateLastSuccessor(ThreadingPath &TPath,
+                           DuplicateBlockMap &DuplicateMap,
+                           DomTreeUpdater *DTU) {
+    uint64_t NextState = TPath.getExitValue();
+    BasicBlock *BB = TPath.getPath().back();
+    BasicBlock *LastBlock = getClonedBB(BB, NextState, DuplicateMap);
+
+    // Note multiple paths can end at the same block so check that it is not
+    // updated yet
+    if (!isa<SwitchInst>(LastBlock->getTerminator()))
+      return;
+    SwitchInst *Switch = cast<SwitchInst>(LastBlock->getTerminator());
+    BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState);
+
+    std::vector<DominatorTree::UpdateType> DTUpdates;
+    SmallPtrSet<BasicBlock *, 4> SuccSet;
+    for (BasicBlock *Succ : successors(LastBlock)) {
+      if (Succ != NextCase && SuccSet.insert(Succ).second)
+        DTUpdates.push_back({DominatorTree::Delete, LastBlock, Succ});
+    }
+
+    Switch->eraseFromParent();
+    BranchInst::Create(NextCase, LastBlock);
+
+    DTU->applyUpdates(DTUpdates);
+  }
+
+  /// After cloning blocks, some of the phi nodes have extra incoming values
+  /// that are no longer used. This function removes them.
+  void cleanPhiNodes(BasicBlock *BB) {
+    // If BB is no longer reachable, remove any remaining phi nodes
+    if (pred_empty(BB)) {
+      std::vector<PHINode *> PhiToRemove;
+      for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) {
+        PhiToRemove.push_back(Phi);
+      }
+      for (PHINode *PN : PhiToRemove) {
+        PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+        PN->eraseFromParent();
+      }
+      return;
+    }
+
+    // Remove any incoming values that come from an invalid predecessor
+    for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) {
+      std::vector<BasicBlock *> BlocksToRemove;
+      for (BasicBlock *IncomingBB : Phi->blocks()) {
+        if (!isPredecessor(BB, IncomingBB))
+          BlocksToRemove.push_back(IncomingBB);
+      }
+      for (BasicBlock *BB : BlocksToRemove)
+        Phi->removeIncomingValue(BB);
+    }
+  }
+
+  /// Checks if BB was already cloned for a particular next state value. If it
+  /// was then it returns this cloned block, and otherwise null.
+  BasicBlock *getClonedBB(BasicBlock *BB, uint64_t NextState,
+                          DuplicateBlockMap &DuplicateMap) {
+    CloneList ClonedBBs = DuplicateMap[BB];
+
+    // Find an entry in the CloneList with this NextState. If it exists then
+    // return the corresponding BB
+    auto It = llvm::find_if(ClonedBBs, [NextState](const ClonedBlock &C) {
+      return C.State == NextState;
+    });
+    return It != ClonedBBs.end() ? (*It).BB : nullptr;
+  }
+
+  /// Helper to get the successor corresponding to a particular case value for
+  /// a switch statement.
+  BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, uint64_t NextState) {
+    BasicBlock *NextCase = nullptr;
+    for (auto Case : Switch->cases()) {
+      if (Case.getCaseValue()->getZExtValue() == NextState) {
+        NextCase = Case.getCaseSuccessor();
+        break;
+      }
+    }
+    if (!NextCase)
+      NextCase = Switch->getDefaultDest();
+    return NextCase;
+  }
+
+  /// Returns true if IncomingBB is a predecessor of BB.
+  bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) {
+    return llvm::find(predecessors(BB), IncomingBB) != pred_end(BB);
+  }
+
+  AllSwitchPaths *SwitchPaths;
+  DominatorTree *DT;
+  AssumptionCache *AC;
+  TargetTransformInfo *TTI;
+  OptimizationRemarkEmitter *ORE;
+  SmallPtrSet<const Value *, 32> EphValues;
+  std::vector<ThreadingPath> TPaths;
+};
+
+bool DFAJumpThreading::run(Function &F) {
+  LLVM_DEBUG(dbgs() << "\nDFA Jump threading: " << F.getName() << "\n");
+
+  if (F.hasOptSize()) {
+    LLVM_DEBUG(dbgs() << "Skipping due to the 'minsize' attribute\n");
+    return false;
+  }
+
+  if (ClViewCfgBefore)
+    F.viewCFG();
+
+  SmallVector<AllSwitchPaths, 2> ThreadableLoops;
+  bool MadeChanges = false;
+
+  for (BasicBlock &BB : F) {
+    auto *SI = dyn_cast<SwitchInst>(BB.getTerminator());
+    if (!SI)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName()
+                      << " is predictable\n");
+    MainSwitch Switch(SI, ORE);
+
+    if (!Switch.getInstr())
+      continue;
+
+    LLVM_DEBUG(dbgs() << "\nSwitchInst in BB " << BB.getName() << " is a "
+                      << "candidate for jump threading\n");
+    LLVM_DEBUG(SI->dump());
+
+    unfoldSelectInstrs(DT, Switch.getSelectInsts());
+    if (!Switch.getSelectInsts().empty())
+      MadeChanges = true;
+
+    AllSwitchPaths SwitchPaths(&Switch, ORE);
+    SwitchPaths.run();
+
+    if (SwitchPaths.getNumThreadingPaths() > 0) {
+      ThreadableLoops.push_back(SwitchPaths);
+
+      // For the time being limit this optimization to occurring once in a
+      // function since it can change the CFG significantly. This is not a
+      // strict requirement but it can cause buggy behavior if there is an
+      // overlap of blocks in different opportunities. There is a lot of room to
+      // experiment with catching more opportunities here.
+      break;
+    }
+  }
+
+  SmallPtrSet<const Value *, 32> EphValues;
+  if (ThreadableLoops.size() > 0)
+    CodeMetrics::collectEphemeralValues(&F, AC, EphValues);
+
+  for (AllSwitchPaths SwitchPaths : ThreadableLoops) {
+    TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues);
+    Transform.run();
+    MadeChanges = true;
+  }
+
+#ifdef EXPENSIVE_CHECKS
+  assert(DT->verify(DominatorTree::VerificationLevel::Full));
+  verifyFunction(F, &dbgs());
+#endif
+
+  return MadeChanges;
+}
+
+} // end anonymous namespace
+
+/// Integrate with the new Pass Manager
+PreservedAnalyses DFAJumpThreadingPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
+  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  OptimizationRemarkEmitter ORE(&F);
+
+  if (!DFAJumpThreading(&AC, &DT, &TTI, &ORE).run(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
+}
Index: llvm/lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/Scalar.cpp
+++ llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -60,6 +60,7 @@
   initializeInferAddressSpacesPass(Registry);
   initializeInstSimplifyLegacyPassPass(Registry);
   initializeJumpThreadingPass(Registry);
+  initializeDFAJumpThreadingLegacyPassPass(Registry);
   initializeLegacyLICMPassPass(Registry);
   initializeLegacyLoopSinkPassPass(Registry);
   initializeLoopFuseLegacyPass(Registry);
Index: llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -dfa-jump-threading -sccp -simplifycfg %s | FileCheck %s
+
+; This test checks that a constant propagation is applied for a basic loop.
+; Related to bug 44679.
+define i32 @test(i32 %a) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 3
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %num = phi i32 [ 0, %entry ], [ %add, %case1 ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %case1 ]
+  switch i32 %state, label %end [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  %state.next = phi i32 [ 3, %case2 ], [ 2, %while.cond ]
+  %add = add nsw i32 %num, %state
+  br label %while.cond
+
+case2:
+  br label %case1
+
+end:
+  ret i32 %num
+}
Index: llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll
@@ -0,0 +1,180 @@
+; REQUIRES: asserts
+; RUN: opt -S -dfa-jump-threading -debug-only=dfa-jump-threading -disable-output %s 2>&1 | FileCheck %s
+
+; This test checks that the analysis identifies all threadable paths in a
+; simple CFG. A threadable path includes a list of basic blocks, the exit
+; state, and the block that determines the next state.
+; < path of BBs that form a cycle > [ state, determinator ]
+define i32 @test1(i32 %num) {
+; CHECK: < for.body for.inc > [ 1, for.inc ]
+; CHECK-NEXT: < for.body case1 for.inc > [ 2, for.inc ]
+; CHECK-NEXT: < for.body case2 for.inc > [ 1, for.inc ]
+; CHECK-NEXT: < for.body case2 si.unfold.false for.inc > [ 2, for.inc ]
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+    i32 1, label %case1
+    i32 2, label %case2
+  ]
+
+case1:
+  br label %for.inc
+
+case2:
+  %cmp = icmp eq i32 %count, 50
+  %sel = select i1 %cmp, i32 1, i32 2
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ]
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
+
+; This test checks that the analysis finds threadable paths in a more
+; complicated CFG. Here the FSM is represented as a nested loop, with
+; fallthrough cases.
+define i32 @test2(i32 %init) {
+; CHECK: < loop.3 case2 > [ 3, loop.3 ]
+; CHECK-NEXT: < loop.3 case2 loop.1.backedge loop.1 loop.2 > [ 1, loop.1 ]
+; CHECK-NEXT: < loop.3 case2 loop.1.backedge si.unfold.false loop.1 loop.2 > [ 4, loop.1.backedge ]
+; CHECK-NEXT: < loop.3 case3 loop.2.backedge loop.2 > [ 0, loop.2.backedge ]
+; CHECK-NEXT: < loop.3 case3 case4 loop.2.backedge loop.2 > [ 3, loop.2.backedge ]
+; CHECK-NEXT: < loop.3 case3 case4 loop.1.backedge loop.1 loop.2 > [ 1, loop.1 ]
+; CHECK-NEXT: < loop.3 case3 case4 loop.1.backedge si.unfold.false loop.1 loop.2 > [ 2, loop.1.backedge ]
+; CHECK-NEXT: < loop.3 case4 loop.2.backedge loop.2 > [ 3, loop.2.backedge ]
+; CHECK-NEXT: < loop.3 case4 loop.1.backedge loop.1 loop.2 > [ 1, loop.1 ]
+; CHECK-NEXT: < loop.3 case4 loop.1.backedge si.unfold.false loop.1 loop.2 > [ 2, loop.1.backedge ]
+entry:
+  %cmp = icmp eq i32 %init, 0
+  %sel = select i1 %cmp, i32 0, i32 2
+  br label %loop.1
+
+loop.1:
+  %state.1 = phi i32 [ %sel, %entry ], [ %state.1.be2, %loop.1.backedge ]
+  br label %loop.2
+
+loop.2:
+  %state.2 = phi i32 [ %state.1, %loop.1 ], [ %state.2.be, %loop.2.backedge ]
+  br label %loop.3
+
+loop.3:
+  %state = phi i32 [ %state.2, %loop.2 ], [ 3, %case2 ]
+  switch i32 %state, label %infloop.i [
+    i32 2, label %case2
+    i32 3, label %case3
+    i32 4, label %case4
+    i32 0, label %case0
+    i32 1, label %case1
+  ]
+
+case2:
+  br i1 %cmp, label %loop.3, label %loop.1.backedge
+
+case3:
+  br i1 %cmp, label %loop.2.backedge, label %case4
+
+case4:
+  br i1 %cmp, label %loop.2.backedge, label %loop.1.backedge
+
+loop.1.backedge:
+  %state.1.be = phi i32 [ 2, %case4 ], [ 4, %case2 ]
+  %state.1.be2 = select i1 %cmp, i32 1, i32 %state.1.be
+  br label %loop.1
+
+loop.2.backedge:
+  %state.2.be = phi i32 [ 3, %case4 ], [ 0, %case3 ]
+  br label %loop.2
+
+case0:
+  br label %exit
+
+case1:
+  br label %exit
+
+infloop.i:
+  br label %infloop.i
+
+exit:
+  ret i32 0
+}
+
+declare void @baz()
+
+; Verify that having the switch block as a determinator is handled correctly.
+define i32 @main() {
+; CHECK: < bb43 bb59 bb3 bb31 bb41 > [ 77, bb43 ]
+; CHECK-NEXT: < bb43 bb49 bb59 bb3 bb31 bb41 > [ 77, bb43 ]
+bb:
+  %i = alloca [420 x i8], align 1
+  %i2 = getelementptr inbounds [420 x i8], [420 x i8]* %i, i64 0, i64 390
+  br label %bb3
+
+bb3:                                              ; preds = %bb59, %bb
+  %i4 = phi i8* [ %i2, %bb ], [ %i60, %bb59 ]
+  %i5 = phi i8 [ 77, %bb ], [ %i64, %bb59 ]
+  %i6 = phi i32 [ 2, %bb ], [ %i63, %bb59 ]
+  %i7 = phi i32 [ 26, %bb ], [ %i62, %bb59 ]
+  %i8 = phi i32 [ 25, %bb ], [ %i61, %bb59 ]
+  %i9 = icmp sgt i32 %i7, 2
+  %i10 = select i1 %i9, i32 %i7, i32 2
+  %i11 = add i32 %i8, 2
+  %i12 = sub i32 %i11, %i10
+  %i13 = mul nsw i32 %i12, 3
+  %i14 = add nsw i32 %i13, %i6
+  %i15 = sext i32 %i14 to i64
+  %i16 = getelementptr inbounds i8, i8* %i4, i64 %i15
+  %i17 = load i8, i8* %i16, align 1
+  %i18 = icmp sgt i8 %i17, 0
+  br i1 %i18, label %bb21, label %bb31
+
+bb21:                                             ; preds = %bb3
+  br i1 true, label %bb59, label %bb43
+
+bb59:                                             ; preds = %bb49, %bb43, %bb31, %bb21
+  %i60 = phi i8* [ %i44, %bb49 ], [ %i44, %bb43 ], [ %i34, %bb31 ], [ %i4, %bb21 ]
+  %i61 = phi i32 [ %i45, %bb49 ], [ %i45, %bb43 ], [ %i33, %bb31 ], [ %i8, %bb21 ]
+  %i62 = phi i32 [ %i47, %bb49 ], [ %i47, %bb43 ], [ %i32, %bb31 ], [ %i7, %bb21 ]
+  %i63 = phi i32 [ %i48, %bb49 ], [ %i48, %bb43 ], [ 2, %bb31 ], [ %i6, %bb21 ]
+  %i64 = phi i8 [ %i46, %bb49 ], [ %i46, %bb43 ], [ 77, %bb31 ], [ %i5, %bb21 ]
+  %i65 = icmp sgt i32 %i62, 0
+  br i1 %i65, label %bb3, label %bb66
+
+bb31:                                             ; preds = %bb3
+  %i32 = add nsw i32 %i7, -1
+  %i33 = add nsw i32 %i8, -1
+  %i34 = getelementptr inbounds i8, i8* %i4, i64 -15
+  %i35 = icmp eq i8 %i5, 77
+  br i1 %i35, label %bb59, label %bb41
+
+bb41:                                             ; preds = %bb31
+  tail call void @baz()
+  br label %bb43
+
+bb43:                                             ; preds = %bb41, %bb21
+  %i44 = phi i8* [ %i34, %bb41 ], [ %i4, %bb21 ]
+  %i45 = phi i32 [ %i33, %bb41 ], [ %i8, %bb21 ]
+  %i46 = phi i8 [ 77, %bb41 ], [ %i5, %bb21 ]
+  %i47 = phi i32 [ %i32, %bb41 ], [ %i7, %bb21 ]
+  %i48 = phi i32 [ 2, %bb41 ], [ %i6, %bb21 ]
+  tail call void @baz()
+  switch i8 %i5, label %bb59 [
+    i8 68, label %bb49
+    i8 73, label %bb49
+  ]
+
+bb49:                                             ; preds = %bb43, %bb43
+  tail call void @baz()
+  br label %bb59
+
+bb66:                                             ; preds = %bb59
+  ret i32 0
+}
Index: llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll
@@ -0,0 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -dfa-jump-threading %s | FileCheck %s
+
+; These tests check that the DFA jump threading transformation is applied
+; properly to two CFGs. It checks that blocks are cloned, branches are updated,
+; and SSA form is restored.
+define i32 @test1(i32 %num) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ undef, [[FOR_INC]] ]
+; CHECK-NEXT:    switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [
+; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       for.body.jt2:
+; CHECK-NEXT:    [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ]
+; CHECK-NEXT:    [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ]
+; CHECK-NEXT:    br label [[CASE2]]
+; CHECK:       for.body.jt1:
+; CHECK-NEXT:    [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    br label [[CASE1]]
+; CHECK:       case1:
+; CHECK-NEXT:    [[COUNT2:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_INC_JT2]]
+; CHECK:       case2:
+; CHECK-NEXT:    [[COUNT1:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[COUNT1]], 50
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE:%.*]]
+; CHECK:       si.unfold.false:
+; CHECK-NEXT:    br label [[FOR_INC_JT2]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nsw i32 undef, 1
+; CHECK-NEXT:    [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.inc.jt2:
+; CHECK-NEXT:    [[COUNT4:%.*]] = phi i32 [ [[COUNT1]], [[SI_UNFOLD_FALSE]] ], [ [[COUNT2]], [[CASE1]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT2]] = phi i32 [ 2, [[CASE1]] ], [ 2, [[SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[INC_JT2]] = add nsw i32 [[COUNT4]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]]
+; CHECK:       for.inc.jt1:
+; CHECK-NEXT:    [[COUNT3:%.*]] = phi i32 [ [[COUNT1]], [[CASE2]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT1]] = phi i32 [ 1, [[CASE2]] ], [ 1, [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC_JT1]] = add nsw i32 [[COUNT3]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  br label %for.inc
+
+case2:
+  %cmp = icmp eq i32 %count, 50
+  %sel = select i1 %cmp, i32 1, i32 2
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ]
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
+
+
+define i32 @test2(i32 %init) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INIT:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_1:%.*]], label [[SI_UNFOLD_FALSE1:%.*]]
+; CHECK:       si.unfold.false:
+; CHECK-NEXT:    br label [[LOOP_1]]
+; CHECK:       si.unfold.false.jt2:
+; CHECK-NEXT:    br label [[LOOP_1_JT2:%.*]]
+; CHECK:       si.unfold.false.jt4:
+; CHECK-NEXT:    br label [[LOOP_1_JT4:%.*]]
+; CHECK:       si.unfold.false1:
+; CHECK-NEXT:    br label [[LOOP_1]]
+; CHECK:       loop.1:
+; CHECK-NEXT:    [[STATE_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ undef, [[SI_UNFOLD_FALSE:%.*]] ], [ 2, [[SI_UNFOLD_FALSE1]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop.1.jt2:
+; CHECK-NEXT:    [[STATE_1_JT2:%.*]] = phi i32 [ [[STATE_1_BE_JT2:%.*]], [[SI_UNFOLD_FALSE_JT2:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2_JT2:%.*]]
+; CHECK:       loop.1.jt4:
+; CHECK-NEXT:    [[STATE_1_JT4:%.*]] = phi i32 [ [[STATE_1_BE_JT4:%.*]], [[SI_UNFOLD_FALSE_JT4:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2_JT4:%.*]]
+; CHECK:       loop.1.jt1:
+; CHECK-NEXT:    [[STATE_1_JT1:%.*]] = phi i32 [ 1, [[LOOP_1_BACKEDGE:%.*]] ], [ 1, [[LOOP_1_BACKEDGE_JT4:%.*]] ], [ 1, [[LOOP_1_BACKEDGE_JT2:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2_JT1:%.*]]
+; CHECK:       loop.2:
+; CHECK-NEXT:    [[STATE_2:%.*]] = phi i32 [ [[STATE_1]], [[LOOP_1]] ], [ undef, [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop.2.jt2:
+; CHECK-NEXT:    [[STATE_2_JT2:%.*]] = phi i32 [ [[STATE_1_JT2]], [[LOOP_1_JT2]] ]
+; CHECK-NEXT:    br label [[LOOP_3_JT2:%.*]]
+; CHECK:       loop.2.jt3:
+; CHECK-NEXT:    [[STATE_2_JT3:%.*]] = phi i32 [ [[STATE_2_BE_JT3:%.*]], [[LOOP_2_BACKEDGE_JT3:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3_JT3:%.*]]
+; CHECK:       loop.2.jt0:
+; CHECK-NEXT:    [[STATE_2_JT0:%.*]] = phi i32 [ [[STATE_2_BE_JT0:%.*]], [[LOOP_2_BACKEDGE_JT0:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3_JT0:%.*]]
+; CHECK:       loop.2.jt4:
+; CHECK-NEXT:    [[STATE_2_JT4:%.*]] = phi i32 [ [[STATE_1_JT4]], [[LOOP_1_JT4]] ]
+; CHECK-NEXT:    br label [[LOOP_3_JT4:%.*]]
+; CHECK:       loop.2.jt1:
+; CHECK-NEXT:    [[STATE_2_JT1:%.*]] = phi i32 [ [[STATE_1_JT1]], [[LOOP_1_JT1:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3_JT1:%.*]]
+; CHECK:       loop.3:
+; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ [[STATE_2]], [[LOOP_2]] ]
+; CHECK-NEXT:    switch i32 [[STATE]], label [[INFLOOP_I:%.*]] [
+; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    i32 3, label [[CASE3:%.*]]
+; CHECK-NEXT:    i32 4, label [[CASE4:%.*]]
+; CHECK-NEXT:    i32 0, label [[CASE0:%.*]]
+; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       loop.3.jt2:
+; CHECK-NEXT:    [[STATE_JT2:%.*]] = phi i32 [ [[STATE_2_JT2]], [[LOOP_2_JT2]] ]
+; CHECK-NEXT:    br label [[CASE2]]
+; CHECK:       loop.3.jt0:
+; CHECK-NEXT:    [[STATE_JT0:%.*]] = phi i32 [ [[STATE_2_JT0]], [[LOOP_2_JT0:%.*]] ]
+; CHECK-NEXT:    br label [[CASE0]]
+; CHECK:       loop.3.jt4:
+; CHECK-NEXT:    [[STATE_JT4:%.*]] = phi i32 [ [[STATE_2_JT4]], [[LOOP_2_JT4]] ]
+; CHECK-NEXT:    br label [[CASE4]]
+; CHECK:       loop.3.jt1:
+; CHECK-NEXT:    [[STATE_JT1:%.*]] = phi i32 [ [[STATE_2_JT1]], [[LOOP_2_JT1]] ]
+; CHECK-NEXT:    br label [[CASE1]]
+; CHECK:       loop.3.jt3:
+; CHECK-NEXT:    [[STATE_JT3:%.*]] = phi i32 [ 3, [[CASE2]] ], [ [[STATE_2_JT3]], [[LOOP_2_JT3:%.*]] ]
+; CHECK-NEXT:    br label [[CASE3]]
+; CHECK:       case2:
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_3_JT3]], label [[LOOP_1_BACKEDGE_JT4]]
+; CHECK:       case3:
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_2_BACKEDGE_JT0]], label [[CASE4]]
+; CHECK:       case4:
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_2_BACKEDGE_JT3]], label [[LOOP_1_BACKEDGE_JT2]]
+; CHECK:       loop.1.backedge:
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_1_JT1]], label [[SI_UNFOLD_FALSE]]
+; CHECK:       loop.1.backedge.jt2:
+; CHECK-NEXT:    [[STATE_1_BE_JT2]] = phi i32 [ 2, [[CASE4]] ]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_1_JT1]], label [[SI_UNFOLD_FALSE_JT2]]
+; CHECK:       loop.1.backedge.jt4:
+; CHECK-NEXT:    [[STATE_1_BE_JT4]] = phi i32 [ 4, [[CASE2]] ]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_1_JT1]], label [[SI_UNFOLD_FALSE_JT4]]
+; CHECK:       loop.2.backedge:
+; CHECK-NEXT:    br label [[LOOP_2]]
+; CHECK:       loop.2.backedge.jt3:
+; CHECK-NEXT:    [[STATE_2_BE_JT3]] = phi i32 [ 3, [[CASE4]] ]
+; CHECK-NEXT:    br label [[LOOP_2_JT3]]
+; CHECK:       loop.2.backedge.jt0:
+; CHECK-NEXT:    [[STATE_2_BE_JT0]] = phi i32 [ 0, [[CASE3]] ]
+; CHECK-NEXT:    br label [[LOOP_2_JT0]]
+; CHECK:       case0:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       case1:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       infloop.i:
+; CHECK-NEXT:    br label [[INFLOOP_I]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %cmp = icmp eq i32 %init, 0
+  %sel = select i1 %cmp, i32 0, i32 2
+  br label %loop.1
+
+loop.1:
+  %state.1 = phi i32 [ %sel, %entry ], [ %state.1.be2, %loop.1.backedge ]
+  br label %loop.2
+
+loop.2:
+  %state.2 = phi i32 [ %state.1, %loop.1 ], [ %state.2.be, %loop.2.backedge ]
+  br label %loop.3
+
+loop.3:
+  %state = phi i32 [ %state.2, %loop.2 ], [ 3, %case2 ]
+  switch i32 %state, label %infloop.i [
+  i32 2, label %case2
+  i32 3, label %case3
+  i32 4, label %case4
+  i32 0, label %case0
+  i32 1, label %case1
+  ]
+
+case2:
+  br i1 %cmp, label %loop.3, label %loop.1.backedge
+
+case3:
+  br i1 %cmp, label %loop.2.backedge, label %case4
+
+case4:
+  br i1 %cmp, label %loop.2.backedge, label %loop.1.backedge
+
+loop.1.backedge:
+  %state.1.be = phi i32 [ 2, %case4 ], [ 4, %case2 ]
+  %state.1.be2 = select i1 %cmp, i32 1, i32 %state.1.be
+  br label %loop.1
+
+loop.2.backedge:
+  %state.2.be = phi i32 [ 3, %case4 ], [ 0, %case3 ]
+  br label %loop.2
+
+case0:
+  br label %exit
+
+case1:
+  br label %exit
+
+infloop.i:
+  br label %infloop.i
+
+exit:
+  ret i32 0
+}
Index: llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -dfa-jump-threading %s | FileCheck %s
+
+; These tests check if selects are unfolded properly for jump threading
+; opportunities. There are three different patterns to consider:
+; 1) Both operands are constant and the false branch is unfolded by default
+; 2) One operand is constant and the other is another select to be unfolded. In
+;    this case a single select is sunk to a new block to unfold.
+; 3) Both operands are a select, and both should be sunk to new blocks.
+define i32 @test1(i32 %num) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ undef, [[FOR_INC]] ]
+; CHECK-NEXT:    switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [
+; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       for.body.jt2:
+; CHECK-NEXT:    [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ]
+; CHECK-NEXT:    [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ]
+; CHECK-NEXT:    br label [[CASE2]]
+; CHECK:       for.body.jt1:
+; CHECK-NEXT:    [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    br label [[CASE1]]
+; CHECK:       case1:
+; CHECK-NEXT:    [[COUNT2:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_INC_JT2]]
+; CHECK:       case2:
+; CHECK-NEXT:    [[COUNT1:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[COUNT1]], 50
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE:%.*]]
+; CHECK:       si.unfold.false:
+; CHECK-NEXT:    br label [[FOR_INC_JT2]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nsw i32 undef, 1
+; CHECK-NEXT:    [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.inc.jt2:
+; CHECK-NEXT:    [[COUNT4:%.*]] = phi i32 [ [[COUNT1]], [[SI_UNFOLD_FALSE]] ], [ [[COUNT2]], [[CASE1]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT2]] = phi i32 [ 2, [[CASE1]] ], [ 2, [[SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[INC_JT2]] = add nsw i32 [[COUNT4]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]]
+; CHECK:       for.inc.jt1:
+; CHECK-NEXT:    [[COUNT3:%.*]] = phi i32 [ [[COUNT1]], [[CASE2]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT1]] = phi i32 [ 1, [[CASE2]] ], [ 1, [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC_JT1]] = add nsw i32 [[COUNT3]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  br label %for.inc
+
+case2:
+  %cmp = icmp slt i32 %count, 50
+  %sel = select i1 %cmp, i32 1, i32 2
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ]
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
+
+define i32 @test2(i32 %num) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ undef, [[FOR_INC]] ]
+; CHECK-NEXT:    switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [
+; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       for.body.jt3:
+; CHECK-NEXT:    [[COUNT_JT3:%.*]] = phi i32 [ [[INC_JT3:%.*]], [[FOR_INC_JT3:%.*]] ]
+; CHECK-NEXT:    [[STATE_JT3:%.*]] = phi i32 [ [[STATE_NEXT_JT3:%.*]], [[FOR_INC_JT3]] ]
+; CHECK-NEXT:    br label [[FOR_INC_JT1]]
+; CHECK:       for.body.jt2:
+; CHECK-NEXT:    [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ]
+; CHECK-NEXT:    [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ]
+; CHECK-NEXT:    br label [[CASE2]]
+; CHECK:       for.body.jt1:
+; CHECK-NEXT:    [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    br label [[CASE1]]
+; CHECK:       case1:
+; CHECK-NEXT:    [[COUNT5:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CMP_C1:%.*]] = icmp slt i32 [[COUNT5]], 50
+; CHECK-NEXT:    [[CMP2_C1:%.*]] = icmp slt i32 [[COUNT5]], 100
+; CHECK-NEXT:    br i1 [[CMP2_C1]], label [[SI_UNFOLD_TRUE:%.*]], label [[FOR_INC_JT3]]
+; CHECK:       case2:
+; CHECK-NEXT:    [[COUNT3:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CMP_C2:%.*]] = icmp slt i32 [[COUNT3]], 50
+; CHECK-NEXT:    [[CMP2_C2:%.*]] = icmp sgt i32 [[COUNT3]], 100
+; CHECK-NEXT:    br i1 [[CMP2_C2]], label [[FOR_INC_JT3]], label [[SI_UNFOLD_FALSE:%.*]]
+; CHECK:       si.unfold.false:
+; CHECK-NEXT:    br i1 [[CMP_C2]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE1:%.*]]
+; CHECK:       si.unfold.false1:
+; CHECK-NEXT:    br label [[FOR_INC_JT2]]
+; CHECK:       si.unfold.true:
+; CHECK-NEXT:    br i1 [[CMP_C1]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE2:%.*]]
+; CHECK:       si.unfold.false2:
+; CHECK-NEXT:    br label [[FOR_INC_JT2]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nsw i32 undef, 1
+; CHECK-NEXT:    [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.inc.jt3:
+; CHECK-NEXT:    [[COUNT6:%.*]] = phi i32 [ [[COUNT3]], [[CASE2]] ], [ [[COUNT5]], [[CASE1]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT3]] = phi i32 [ 3, [[CASE1]] ], [ 3, [[CASE2]] ]
+; CHECK-NEXT:    [[INC_JT3]] = add nsw i32 [[COUNT6]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT3:%.*]] = icmp slt i32 [[INC_JT3]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT3]], label [[FOR_BODY_JT3:%.*]], label [[FOR_END]]
+; CHECK:       for.inc.jt2:
+; CHECK-NEXT:    [[COUNT7:%.*]] = phi i32 [ [[COUNT3]], [[SI_UNFOLD_FALSE1]] ], [ [[COUNT5]], [[SI_UNFOLD_FALSE2]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT2]] = phi i32 [ 2, [[SI_UNFOLD_FALSE1]] ], [ 2, [[SI_UNFOLD_FALSE2]] ]
+; CHECK-NEXT:    [[INC_JT2]] = add nsw i32 [[COUNT7]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]]
+; CHECK:       for.inc.jt1:
+; CHECK-NEXT:    [[COUNT4:%.*]] = phi i32 [ [[COUNT_JT3]], [[FOR_BODY_JT3]] ], [ [[COUNT3]], [[SI_UNFOLD_FALSE]] ], [ [[COUNT5]], [[SI_UNFOLD_TRUE]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT1]] = phi i32 [ 1, [[FOR_BODY]] ], [ 1, [[SI_UNFOLD_FALSE]] ], [ 1, [[SI_UNFOLD_TRUE]] ], [ 1, [[FOR_BODY_JT3]] ]
+; CHECK-NEXT:    [[INC_JT1]] = add nsw i32 [[COUNT4]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  %cmp.c1 = icmp slt i32 %count, 50
+  %cmp2.c1 = icmp slt i32 %count, 100
+  %state1.1 = select i1 %cmp.c1, i32 1, i32 2
+  %state1.2 = select i1 %cmp2.c1, i32 %state1.1, i32 3
+  br label %for.inc
+
+case2:
+  %cmp.c2 = icmp slt i32 %count, 50
+  %cmp2.c2 = icmp sgt i32 %count, 100
+  %state2.1 = select i1 %cmp.c2, i32 1, i32 2
+  %state2.2 = select i1 %cmp2.c2, i32 3, i32 %state2.1
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %state1.2, %case1 ], [ %state2.2, %case2 ], [ 1, %for.body ]
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
+
+define i32 @test3(i32 %num) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ undef, [[FOR_INC]] ]
+; CHECK-NEXT:    switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [
+; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       for.body.jt4:
+; CHECK-NEXT:    [[COUNT_JT4:%.*]] = phi i32 [ [[INC_JT4:%.*]], [[FOR_INC_JT4:%.*]] ]
+; CHECK-NEXT:    [[STATE_JT4:%.*]] = phi i32 [ [[STATE_NEXT_JT4:%.*]], [[FOR_INC_JT4]] ]
+; CHECK-NEXT:    br label [[FOR_INC_JT1]]
+; CHECK:       for.body.jt3:
+; CHECK-NEXT:    [[COUNT_JT3:%.*]] = phi i32 [ [[INC_JT3:%.*]], [[FOR_INC_JT3:%.*]] ]
+; CHECK-NEXT:    [[STATE_JT3:%.*]] = phi i32 [ [[STATE_NEXT_JT3:%.*]], [[FOR_INC_JT3]] ]
+; CHECK-NEXT:    br label [[FOR_INC_JT1]]
+; CHECK:       for.body.jt2:
+; CHECK-NEXT:    [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ]
+; CHECK-NEXT:    [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ]
+; CHECK-NEXT:    br label [[CASE2]]
+; CHECK:       for.body.jt1:
+; CHECK-NEXT:    [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    br label [[CASE1]]
+; CHECK:       case1:
+; CHECK-NEXT:    [[COUNT5:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_INC_JT2]]
+; CHECK:       case2:
+; CHECK-NEXT:    [[COUNT4:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CMP_1:%.*]] = icmp slt i32 [[COUNT4]], 50
+; CHECK-NEXT:    [[CMP_2:%.*]] = icmp slt i32 [[COUNT4]], 100
+; CHECK-NEXT:    [[TMP0:%.*]] = and i32 [[COUNT4]], 1
+; CHECK-NEXT:    [[CMP_3:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[CMP_3]], label [[SI_UNFOLD_TRUE:%.*]], label [[SI_UNFOLD_FALSE:%.*]]
+; CHECK:       si.unfold.true:
+; CHECK-NEXT:    br i1 [[CMP_1]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE2:%.*]]
+; CHECK:       si.unfold.false:
+; CHECK-NEXT:    br i1 [[CMP_2]], label [[FOR_INC_JT3]], label [[SI_UNFOLD_FALSE1:%.*]]
+; CHECK:       si.unfold.false1:
+; CHECK-NEXT:    br label [[FOR_INC_JT4]]
+; CHECK:       si.unfold.false2:
+; CHECK-NEXT:    br label [[FOR_INC_JT2]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nsw i32 undef, 1
+; CHECK-NEXT:    [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.inc.jt4:
+; CHECK-NEXT:    [[STATE_NEXT_JT4]] = phi i32 [ 4, [[SI_UNFOLD_FALSE1]] ]
+; CHECK-NEXT:    [[INC_JT4]] = add nsw i32 [[COUNT4]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT4:%.*]] = icmp slt i32 [[INC_JT4]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT4]], label [[FOR_BODY_JT4:%.*]], label [[FOR_END]]
+; CHECK:       for.inc.jt3:
+; CHECK-NEXT:    [[STATE_NEXT_JT3]] = phi i32 [ 3, [[SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[INC_JT3]] = add nsw i32 [[COUNT4]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT3:%.*]] = icmp slt i32 [[INC_JT3]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT3]], label [[FOR_BODY_JT3:%.*]], label [[FOR_END]]
+; CHECK:       for.inc.jt2:
+; CHECK-NEXT:    [[COUNT6:%.*]] = phi i32 [ [[COUNT4]], [[SI_UNFOLD_FALSE2]] ], [ [[COUNT5]], [[CASE1]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT2]] = phi i32 [ 2, [[CASE1]] ], [ 2, [[SI_UNFOLD_FALSE2]] ]
+; CHECK-NEXT:    [[INC_JT2]] = add nsw i32 [[COUNT6]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]]
+; CHECK:       for.inc.jt1:
+; CHECK-NEXT:    [[COUNT3:%.*]] = phi i32 [ [[COUNT_JT4]], [[FOR_BODY_JT4]] ], [ [[COUNT_JT3]], [[FOR_BODY_JT3]] ], [ [[COUNT4]], [[SI_UNFOLD_TRUE]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT1]] = phi i32 [ 1, [[FOR_BODY]] ], [ 1, [[SI_UNFOLD_TRUE]] ], [ 1, [[FOR_BODY_JT3]] ], [ 1, [[FOR_BODY_JT4]] ]
+; CHECK-NEXT:    [[INC_JT1]] = add nsw i32 [[COUNT3]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  br label %for.inc
+
+case2:
+  %cmp.1 = icmp slt i32 %count, 50
+  %cmp.2 = icmp slt i32 %count, 100
+  %0 = and i32 %count, 1
+  %cmp.3 = icmp eq i32 %0, 0
+  %sel.1 = select i1 %cmp.1, i32 1, i32 2
+  %sel.2 = select i1 %cmp.2, i32 3, i32 4
+  %sel.3 = select i1 %cmp.3, i32 %sel.1, i32 %sel.2
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %sel.3, %case2 ], [ 1, %for.body ], [ 2, %case1 ]
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
Index: llvm/test/Transforms/DFAJumpThreading/max-path-length.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/DFAJumpThreading/max-path-length.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -dfa-jump-threading -dfa-max-path-length=6 %s | FileCheck %s
+
+; Make the path
+;   <%for.body %case1 %case1.1 %case1.2 %case1.3 %case1.4 %for.inc %for.end>
+; too long so that it is not jump-threaded.
+define i32 @max_path_length(i32 %num) {
+; CHECK-LABEL: @max_path_length(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[STATE_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [
+; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       for.body.jt2:
+; CHECK-NEXT:    [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ]
+; CHECK-NEXT:    [[STATE_JT2:%.*]] = phi i32 [ [[STATE_NEXT_JT2:%.*]], [[FOR_INC_JT2]] ]
+; CHECK-NEXT:    br label [[CASE2]]
+; CHECK:       for.body.jt1:
+; CHECK-NEXT:    [[COUNT_JT1:%.*]] = phi i32 [ [[INC_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    [[STATE_JT1:%.*]] = phi i32 [ [[STATE_NEXT_JT1:%.*]], [[FOR_INC_JT1]] ]
+; CHECK-NEXT:    br label [[CASE1]]
+; CHECK:       case1:
+; CHECK-NEXT:    [[COUNT2:%.*]] = phi i32 [ [[COUNT_JT1]], [[FOR_BODY_JT1:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[CASE1_1:%.*]]
+; CHECK:       case1.1:
+; CHECK-NEXT:    br label [[CASE1_2:%.*]]
+; CHECK:       case1.2:
+; CHECK-NEXT:    br label [[CASE1_3:%.*]]
+; CHECK:       case1.3:
+; CHECK-NEXT:    br label [[CASE1_4:%.*]]
+; CHECK:       case1.4:
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       case2:
+; CHECK-NEXT:    [[COUNT1:%.*]] = phi i32 [ [[COUNT_JT2]], [[FOR_BODY_JT2:%.*]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[COUNT1]], 50
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_INC_JT1]], label [[SI_UNFOLD_FALSE:%.*]]
+; CHECK:       si.unfold.false:
+; CHECK-NEXT:    br label [[FOR_INC_JT2]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[STATE_NEXT]] = phi i32 [ 2, [[CASE1_4]] ]
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[COUNT2]], 1
+; CHECK-NEXT:    [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.inc.jt2:
+; CHECK-NEXT:    [[STATE_NEXT_JT2]] = phi i32 [ 2, [[SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[INC_JT2]] = add nsw i32 [[COUNT1]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]]
+; CHECK:       for.inc.jt1:
+; CHECK-NEXT:    [[COUNT3:%.*]] = phi i32 [ [[COUNT1]], [[CASE2]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[STATE_NEXT_JT1]] = phi i32 [ 1, [[CASE2]] ], [ 1, [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC_JT1]] = add nsw i32 [[COUNT3]], 1
+; CHECK-NEXT:    [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  br label %case1.1
+
+case1.1:
+  br label %case1.2
+
+case1.2:
+  br label %case1.3
+
+case1.3:
+  br label %case1.4
+
+case1.4:
+  br label %for.inc
+
+case2:
+  %cmp = icmp eq i32 %count, 50
+  %sel = select i1 %cmp, i32 1, i32 2
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1.4 ]
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
Index: llvm/test/Transforms/DFAJumpThreading/negative.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/DFAJumpThreading/negative.ll
@@ -0,0 +1,216 @@
+; RUN: opt -dfa-jump-threading -dfa-cost-threshold=25 -pass-remarks-missed='dfa-jump-threading' -pass-remarks-output=%t -disable-output %s
+; RUN: FileCheck --input-file %t --check-prefix=REMARK %s
+; RUN: opt -S -dfa-jump-threading %s | FileCheck %s
+
+; This negative test case checks that the optimization doesn't trigger
+; when the code size cost is too high.
+define i32 @negative1(i32 %num) {
+; REMARK: NotProfitable
+; REMARK-NEXT: negative1
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  br label %for.inc
+
+case2:
+  %cmp = icmp eq i32 %count, 50
+  %sel = select i1 %cmp, i32 1, i32 2
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ]
+  %add1 = add i32 %num, %num
+  %add2 = add i32 %add1, %add1
+  %add3 = add i32 %add2, %add2
+  %add4 = add i32 %add3, %add3
+  %add5 = add i32 %add4, %add4
+  %add6 = add i32 %add5, %add5
+  %add7 = add i32 %add6, %add6
+  %add8 = add i32 %add7, %add7
+  %add9 = add i32 %add8, %add8
+  %add10 = add i32 %add9, %add9
+  %add11 = add i32 %add10, %add10
+  %add12 = add i32 %add11, %add11
+  %add13 = add i32 %add12, %add12
+  %add14 = add i32 %add13, %add13
+  %add15 = add i32 %add14, %add14
+  %add16 = add i32 %add15, %add15
+  %add17 = add i32 %add16, %add16
+  %add18 = add i32 %add17, %add17
+  %add19 = add i32 %add18, %add18
+  %add20 = add i32 %add19, %add19
+  %add21 = add i32 %add20, %add20
+  %add22 = add i32 %add21, %add21
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 %add22
+}
+
+declare void @func()
+
+define i32 @negative2(i32 %num) {
+; REMARK: NonDuplicatableInst
+; REMARK-NEXT: negative2
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  br label %for.inc
+
+case2:
+  %cmp = icmp eq i32 %count, 50
+  %sel = select i1 %cmp, i32 1, i32 2
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ]
+  call void @func() noduplicate
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
+
+define i32 @negative3(i32 %num) {
+; REMARK: ConvergentInst
+; REMARK-NEXT: negative3
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  br label %for.inc
+
+case2:
+  %cmp = icmp eq i32 %count, 50
+  %sel = select i1 %cmp, i32 1, i32 2
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ]
+  call void @func() convergent
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
+
+define i32 @negative4(i32 %num) {
+; REMARK: SwitchNotPredictable
+; REMARK-NEXT: negative4
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  br label %for.inc
+
+case2:
+  %cmp = icmp eq i32 %count, 50
+  %sel = select i1 %cmp, i32 1, i32 2
+  br label %for.inc
+
+for.inc:
+  ; the switch variable is not predictable since the exit value for %case1
+  ; is defined through a non-instruction (function argument).
+  %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ %num, %case1 ]
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
+
+; Do not optimize if marked minsize.
+define i32 @negative5(i32 %num) minsize {
+; CHECK-LABEL: @negative5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[STATE_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    switch i32 [[STATE]], label [[FOR_INC]] [
+; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       case1:
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       case2:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[COUNT]], 50
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 1, i32 2
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[STATE_NEXT]] = phi i32 [ [[SEL]], [[CASE2]] ], [ 1, [[FOR_BODY]] ], [ 2, [[CASE1]] ]
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[COUNT]], 1
+; CHECK-NEXT:    [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %for.body
+
+for.body:
+  %count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
+  switch i32 %state, label %for.inc [
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case1:
+  br label %for.inc
+
+case2:
+  %cmp = icmp eq i32 %count, 50
+  %sel = select i1 %cmp, i32 1, i32 2
+  br label %for.inc
+
+for.inc:
+  %state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ 2, %case1 ]
+  %inc = add nsw i32 %count, 1
+  %cmp.exit = icmp slt i32 %inc, %num
+  br i1 %cmp.exit, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}