Index: llvm/trunk/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h
+++ llvm/trunk/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h
@@ -0,0 +1,111 @@
+//===- SpeculateAroundPHIs.h - Speculate around PHIs ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_SPECULATEAROUNDPHIS_H
+#define LLVM_TRANSFORMS_SCALAR_SPECULATEAROUNDPHIS_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/Compiler.h"
+#include <vector>
+
+namespace llvm {
+
+/// This pass handles simple speculating of instructions around PHIs when
+/// doing so is profitable for a particular target despite duplicated
+/// instructions.
+///
+/// The motivating examples are PHIs of constants which will require
+/// materializing the constants along each edge. If the PHI is used by an
+/// instruction where the target can materialize the constant as part of the
+/// instruction, it is profitable to speculate those instructions around the
+/// PHI node. This can reduce dynamic instruction count as well as decrease
+/// register pressure.
+///
+/// Consider this IR for example:
+/// ```
+/// entry:
+///   br i1 %flag, label %a, label %b
+///
+/// a:
+///   br label %exit
+///
+/// b:
+///   br label %exit
+///
+/// exit:
+///   %p = phi i32 [ 7, %a ], [ 11, %b ]
+///   %sum = add i32 %arg, %p
+///   ret i32 %sum
+/// ```
+/// Materializing the inputs to this PHI node may require an explicit
+/// instruction. For example, on x86 this would turn into something like:
+/// ```
+///   testq %eax, %eax
+///   movl $7, %rNN
+///   jne .L
+///   movl $11, %rNN
+/// .L:
+///   addl %edi, %rNN
+///   movl %rNN, %eax
+///   retq
+/// ```
+/// When these constants can be folded directly into another instruction, it
+/// would be preferable to avoid the potential for register pressure (above we
+/// can easily avoid it, but that isn't always true) and simply duplicate the
+/// instruction using the PHI:
+/// ```
+/// entry:
+///   br i1 %flag, label %a, label %b
+///
+/// a:
+///   %sum.1 = add i32 %arg, 7
+///   br label %exit
+///
+/// b:
+///   %sum.2 = add i32 %arg, 11
+///   br label %exit
+///
+/// exit:
+///   %p = phi i32 [ %sum.1, %a ], [ %sum.2, %b ]
+///   ret i32 %p
+/// ```
+/// This will generate something like the following on x86:
+/// ```
+///   testq %eax, %eax
+///   addl $7, %edi
+///   jne .L
+///   addl $11, %edi
+/// .L:
+///   movl %edi, %eax
+///   retq
+/// ```
+///
+/// It is important to note that this pass is never intended to handle more
+/// complex cases where speculating around PHIs allows simplifications of the
+/// IR itself or other subsequent optimizations. Those can and should already
+/// be handled before this pass is ever run by a more powerful analysis that
+/// can reason about equivalences and common subexpressions. Classically, those
+/// cases would be handled by a GVN-powered PRE or similar transform. This
+/// pass, in contrast, is *only* interested in cases where despite no
+/// simplifications to the IR itself, speculation is *faster* to execute. The
+/// result of this is that the cost models which are appropriate to consider
+/// here are relatively simple ones around execution and codesize cost, without
+/// any need to consider simplifications or other transformations.
+struct SpeculateAroundPHIsPass : PassInfoMixin<SpeculateAroundPHIsPass> {
+  /// \brief Run the pass over the function.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_SPECULATEAROUNDPHIS_H
Index: llvm/trunk/lib/Passes/PassBuilder.cpp
===================================================================
--- llvm/trunk/lib/Passes/PassBuilder.cpp
+++ llvm/trunk/lib/Passes/PassBuilder.cpp
@@ -132,6 +132,7 @@
 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
 #include "llvm/Transforms/Scalar/SimplifyCFG.h"
 #include "llvm/Transforms/Scalar/Sink.h"
+#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
 #include "llvm/Transforms/Scalar/SpeculativeExecution.h"
 #include "llvm/Transforms/Scalar/TailRecursionElimination.h"
 #include "llvm/Transforms/Utils/AddDiscriminators.h"
@@ -799,6 +800,11 @@
   // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
   OptimizePM.addPass(SimplifyCFGPass());
 
+  // Optimize PHIs by speculating around them when profitable. Note that this
+  // pass needs to be run after any PRE or similar pass as it is essentially
+  // inserting redundancies into the program. This even includes SimplifyCFG.
+  OptimizePM.addPass(SpeculateAroundPHIsPass());
+
   // Add the core optimizing pipeline.
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
 
Index: llvm/trunk/lib/Passes/PassRegistry.def
===================================================================
--- llvm/trunk/lib/Passes/PassRegistry.def
+++ llvm/trunk/lib/Passes/PassRegistry.def
@@ -199,6 +199,7 @@
 FUNCTION_PASS("sink", SinkingPass())
 FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass())
 FUNCTION_PASS("speculative-execution", SpeculativeExecutionPass())
+FUNCTION_PASS("spec-phis", SpeculateAroundPHIsPass())
 FUNCTION_PASS("sroa", SROA())
 FUNCTION_PASS("tailcallelim", TailCallElimPass())
 FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
Index: llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
+++ llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
@@ -62,6 +62,7 @@
   SimplifyCFGPass.cpp
   Sink.cpp
   SpeculativeExecution.cpp
+  SpeculateAroundPHIs.cpp
   StraightLineStrengthReduce.cpp
   StructurizeCFG.cpp
   TailRecursionElimination.cpp
Index: llvm/trunk/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ llvm/trunk/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -0,0 +1,811 @@
+//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spec-phis"
+
+STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
+STATISTIC(NumEdgesSplit,
+          "Number of critical edges which were split for speculation");
+STATISTIC(NumSpeculatedInstructions,
+          "Number of instructions we speculated around the PHI nodes");
+STATISTIC(NumNewRedundantInstructions,
+          "Number of new, redundant instructions inserted");
+
+/// Check whether speculating the users of a PHI node around the PHI
+/// will be safe.
+///
+/// This checks both that all of the users are safe and also that all of their
+/// operands are either recursively safe or already available along an incoming
+/// edge to the PHI.
+///
+/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
+/// and the chain of nodes that definitively reach any unsafe node in
+/// `UnsafeSet`. By preserving these between repeated calls to this routine for
+/// PHIs in the same basic block, the exploration here can be reused. However,
+/// these caches must not be reused for PHIs in a different basic block as they
+/// reflect what is available along incoming edges.
+static bool
+isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
+                          SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+                          SmallPtrSetImpl<Instruction *> &UnsafeSet) {
+  auto *PhiBB = PN.getParent();
+  SmallPtrSet<Instruction *, 16> Visited;
+  SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+
+  // Walk each user of the PHI node.
+  for (Use &U : PN.uses()) {
+    auto *UI = cast<Instruction>(U.getUser());
+
+    // Ensure the use post-dominates the PHI node. This ensures that, in the
+    // absence of unwinding, the use will actually be reached.
+    // FIXME: We use a blunt hammer of requiring them to be in the same basic
+    // block. We should consider using actual post-dominance here in the
+    // future.
+    if (UI->getParent() != PhiBB) {
+      DEBUG(dbgs() << "  Unsafe: use in a different BB: " << *UI << "\n");
+      return false;
+    }
+
+    // FIXME: This check is much too conservative. We're not going to move
+    // these instructions onto new dynamic paths through the program unless
+    // there is a call instruction between the use and the PHI node. And memory
+    // isn't changing unless there is a store in that same sequence. We should
+    // probably change this to do at least a limited scan of the intervening
+    // instructions and allow handling stores in easily proven safe cases.
+    if (mayBeMemoryDependent(*UI)) {
+      DEBUG(dbgs() << "  Unsafe: can't speculate use: " << *UI << "\n");
+      return false;
+    }
+
+    // Now do a depth-first search of everything these users depend on to make
+    // sure they are transitively safe. This is a depth-first search, but we
+    // check nodes in preorder to minimize the amount of checking.
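+    // We keep an explicit stack of (instruction, next-operand-iterator) pairs
+    // rather than recursing, so that when we pop back to a node the walk
+    // resumes at exactly the operand where its exploration left off.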
+ Visited.insert(UI); + DFSStack.push_back({UI, UI->value_op_begin()}); + do { + User::value_op_iterator OpIt; + std::tie(UI, OpIt) = DFSStack.pop_back_val(); + + while (OpIt != UI->value_op_end()) { + auto *OpI = dyn_cast(*OpIt); + // Increment to the next operand for whenever we continue. + ++OpIt; + // No need to visit non-instructions, which can't form dependencies. + if (!OpI) + continue; + + // Now do the main pre-order checks that this operand is a viable + // dependency of something we want to speculate. + + // First do a few checks for instructions that won't require + // speculation at all because they are trivially available on the + // incoming edge (either through dominance or through an incoming value + // to a PHI). + // + // The cases in the current block will be trivially dominated by the + // edge. + auto *ParentBB = OpI->getParent(); + if (ParentBB == PhiBB) { + if (isa(OpI)) { + // We can trivially map through phi nodes in the same block. + continue; + } + } else if (DT.dominates(ParentBB, PhiBB)) { + // Instructions from dominating blocks are already available. + continue; + } + + // Once we know that we're considering speculating the operand, check + // if we've already explored this subgraph and found it to be safe. + if (PotentialSpecSet.count(OpI)) + continue; + + // If we've already explored this subgraph and found it unsafe, bail. + // If when we directly test whether this is safe it fails, bail. + if (UnsafeSet.count(OpI) || ParentBB != PhiBB || + mayBeMemoryDependent(*OpI)) { + DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI + << "\n"); + // Record the stack of instructions which reach this node as unsafe + // so we prune subsequent searches. + UnsafeSet.insert(OpI); + for (auto &StackPair : DFSStack) { + Instruction *I = StackPair.first; + UnsafeSet.insert(I); + } + return false; + } + + // Skip any operands we're already recursively checking. + if (!Visited.insert(OpI).second) + continue; + + // Push onto the stack and descend. We can directly continue this + // loop when ascending. + DFSStack.push_back({UI, OpIt}); + UI = OpI; + OpIt = OpI->value_op_begin(); + } + + // This node and all its operands are safe. Go ahead and cache that for + // reuse later. + PotentialSpecSet.insert(UI); + + // Continue with the next node on the stack. + } while (!DFSStack.empty()); + } + +#ifndef NDEBUG + // Every visited operand should have been marked as safe for speculation at + // this point. Verify this and return success. + for (auto *I : Visited) + assert(PotentialSpecSet.count(I) && + "Failed to mark a visited instruction as safe!"); +#endif + return true; +} + +/// Check whether, in isolation, a given PHI node is both safe and profitable +/// to speculate users around. +/// +/// This handles checking whether there are any constant operands to a PHI +/// which could represent a useful speculation candidate, whether the users of +/// the PHI are safe to speculate including all their transitive dependencies, +/// and whether after speculation there will be some cost savings (profit) to +/// folding the operands into the users of the PHI node. Returns true if both +/// safe and profitable with relevant cost savings updated in the map and with +/// an update to the `PotentialSpecSet`. Returns false if either safety or +/// profitability are absent. Some new entries may be made to the +/// `PotentialSpecSet` even when this routine returns false, but they remain +/// conservatively correct. 
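+///
+/// As a rough illustration of the profitability side (discussed below), on
+/// x86 an incoming constant such as `7` typically folds into an `add` for
+/// free, whereas a wide immediate such as `4294967296` cannot be encoded in
+/// the instruction and would still have to be materialized separately; only
+/// the first kind of incoming constant makes speculation look worthwhile.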
+/// +/// The profitability check here is a local one, but it checks this in an +/// interesting way. Beyond checking that the total cost of materializing the +/// constants will be less than the cost of folding them into their users, it +/// also checks that no one incoming constant will have a higher cost when +/// folded into its users rather than materialized. This higher cost could +/// result in a dynamic *path* that is more expensive even when the total cost +/// is lower. Currently, all of the interesting cases where this optimization +/// should fire are ones where it is a no-loss operation in this sense. If we +/// ever want to be more aggressive here, we would need to balance the +/// different incoming edges' cost by looking at their respective +/// probabilities. +static bool isSafeAndProfitableToSpeculateAroundPHI( + PHINode &PN, SmallDenseMap &CostSavingsMap, + SmallPtrSetImpl &PotentialSpecSet, + SmallPtrSetImpl &UnsafeSet, DominatorTree &DT, + TargetTransformInfo &TTI) { + // First see whether there is any cost savings to speculating around this + // PHI, and build up a map of the constant inputs to how many times they + // occur. + bool NonFreeMat = false; + struct CostsAndCount { + int MatCost = TargetTransformInfo::TCC_Free; + int FoldedCost = TargetTransformInfo::TCC_Free; + int Count = 0; + }; + SmallDenseMap CostsAndCounts; + SmallPtrSet IncomingConstantBlocks; + for (int i : llvm::seq(0, PN.getNumIncomingValues())) { + auto *IncomingC = dyn_cast(PN.getIncomingValue(i)); + if (!IncomingC) + continue; + + // Only visit each incoming edge with a constant input once. + if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second) + continue; + + auto InsertResult = CostsAndCounts.insert({IncomingC, {}}); + // Count how many edges share a given incoming costant. + ++InsertResult.first->second.Count; + // Only compute the cost the first time we see a particular constant. + if (!InsertResult.second) + continue; + + int &MatCost = InsertResult.first->second.MatCost; + MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType()); + NonFreeMat |= MatCost != TTI.TCC_Free; + } + if (!NonFreeMat) { + DEBUG(dbgs() << " Free: " << PN << "\n"); + // No profit in free materialization. + return false; + } + + // Now check that the uses of this PHI can actually be speculated, + // otherwise we'll still have to materialize the PHI value. + if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) { + DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n"); + return false; + } + + // Compute how much (if any) savings are available by speculating around this + // PHI. + for (Use &U : PN.uses()) { + auto *UserI = cast(U.getUser()); + // Now check whether there is any savings to folding the incoming constants + // into this use. + unsigned Idx = U.getOperandNo(); + + // If we have a binary operator that is commutative, an actual constant + // operand would end up on the RHS, so pretend the use of the PHI is on the + // RHS. + // + // Technically, this is a bit weird if *both* operands are PHIs we're + // speculating. But if that is the case, giving an "optimistic" cost isn't + // a bad thing because after speculation it will constant fold. And + // moreover, such cases should likely have been constant folded already by + // some other pass, so we shouldn't worry about "modeling" them terribly + // accurately here. 
Similarly, if the other operand is a constant, it still + // seems fine to be "optimistic" in our cost modeling, because when the + // incoming operand from the PHI node is also a constant, we will end up + // constant folding. + if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1) + // Assume we will commute the constant to the RHS to be canonical. + Idx = 1; + + // Get the intrinsic ID if this user is an instrinsic. + Intrinsic::ID IID = Intrinsic::not_intrinsic; + if (auto *UserII = dyn_cast(UserI)) + IID = UserII->getIntrinsicID(); + + for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) { + ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first; + int MatCost = IncomingConstantAndCostsAndCount.second.MatCost; + int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost; + if (IID) + FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(), + IncomingC->getType()); + else + FoldedCost += + TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(), + IncomingC->getType()); + + // If we accumulate more folded cost for this incoming constant than + // materialized cost, then we'll regress any edge with this constant so + // just bail. We're only interested in cases where folding the incoming + // constants is at least break-even on all paths. + if (FoldedCost > MatCost) { + DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC << "\n" + " Materializing cost: " << MatCost << "\n" + " Accumulated folded cost: " << FoldedCost << "\n"); + return false; + } + } + } + + // Compute the total cost savings afforded by this PHI node. + int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free; + for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) { + int MatCost = IncomingConstantAndCostsAndCount.second.MatCost; + int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost; + int Count = IncomingConstantAndCostsAndCount.second.Count; + + TotalMatCost += MatCost * Count; + TotalFoldedCost += FoldedCost * Count; + } + assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is " + "less that its materialized cost, " + "the sum must be as well."); + + DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost) + << ": " << PN << "\n"); + CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost; + return true; +} + +/// Simple helper to walk all the users of a list of phis depth first, and call +/// a visit function on each one in post-order. +/// +/// All of the PHIs should be in the same basic block, and this is primarily +/// used to make a single depth-first walk across their collective users +/// without revisiting any subgraphs. Callers should provide a fast, idempotent +/// callable to test whether a node has been visited and the more important +/// callable to actually visit a particular node. +/// +/// Depth-first and postorder here refer to the *operand* graph -- we start +/// from a collection of users of PHI nodes and walk "up" the operands +/// depth-first. +template +static void visitPHIUsersAndDepsInPostOrder(ArrayRef PNs, + IsVisitedT IsVisited, + VisitT Visit) { + SmallVector, 16> DFSStack; + for (auto *PN : PNs) + for (Use &U : PN->uses()) { + auto *UI = cast(U.getUser()); + if (IsVisited(UI)) + // Already visited this user, continue across the roots. + continue; + + // Otherwise, walk the operand graph depth-first and visit each + // dependency in postorder. 
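+      // Note that `Visit` only fires for a node once every transitive operand
+      // of that node (within the candidate set) has already been visited,
+      // which is what lets callers accumulate per-subgraph results bottom-up.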
+ DFSStack.push_back({UI, UI->value_op_begin()}); + do { + User::value_op_iterator OpIt; + std::tie(UI, OpIt) = DFSStack.pop_back_val(); + while (OpIt != UI->value_op_end()) { + auto *OpI = dyn_cast(*OpIt); + // Increment to the next operand for whenever we continue. + ++OpIt; + // No need to visit non-instructions, which can't form dependencies, + // or instructions outside of our potential dependency set that we + // were given. Finally, if we've already visited the node, continue + // to the next. + if (!OpI || IsVisited(OpI)) + continue; + + // Push onto the stack and descend. We can directly continue this + // loop when ascending. + DFSStack.push_back({UI, OpIt}); + UI = OpI; + OpIt = OpI->value_op_begin(); + } + + // Finished visiting children, visit this node. + assert(!IsVisited(UI) && "Should not have already visited a node!"); + Visit(UI); + } while (!DFSStack.empty()); + } +} + +/// Find profitable PHIs to speculate. +/// +/// For a PHI node to be profitable, we need the cost of speculating its users +/// (and their dependencies) to not exceed the savings of folding the PHI's +/// constant operands into the speculated users. +/// +/// Computing this is surprisingly challenging. Because users of two different +/// PHI nodes can depend on each other or on common other instructions, it may +/// be profitable to speculate two PHI nodes together even though neither one +/// in isolation is profitable. The straightforward way to find all the +/// profitable PHIs would be to check each combination of PHIs' cost, but this +/// is exponential in complexity. +/// +/// Even if we assume that we only care about cases where we can consider each +/// PHI node in isolation (rather than considering cases where none are +/// profitable in isolation but some subset are profitable as a set), we still +/// have a challenge. The obvious way to find all individually profitable PHIs +/// is to iterate until reaching a fixed point, but this will be quadratic in +/// complexity. =/ +/// +/// This code currently uses a linear-to-compute order for a greedy approach. +/// It won't find cases where a set of PHIs must be considered together, but it +/// handles most cases of order dependence without quadratic iteration. The +/// specific order used is the post-order across the operand DAG. When the last +/// user of a PHI is visited in this postorder walk, we check it for +/// profitability. +/// +/// There is an orthogonal extra complexity to all of this: computing the cost +/// itself can easily become a linear computation making everything again (at +/// best) quadratic. Using a postorder over the operand graph makes it +/// particularly easy to avoid this through dynamic programming. As we do the +/// postorder walk, we build the transitive cost of that subgraph. It is also +/// straightforward to then update these costs when we mark a PHI for +/// speculation so that subsequent PHIs don't re-pay the cost of already +/// speculated instructions. +static SmallVector +findProfitablePHIs(ArrayRef PNs, + const SmallDenseMap &CostSavingsMap, + const SmallPtrSetImpl &PotentialSpecSet, + int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) { + SmallVector SpecPNs; + + // First, establish a reverse mapping from immediate users of the PHI nodes + // to the nodes themselves, and count how many users each PHI node has in + // a way we can update while processing them. 
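+  // Once a PHI's outstanding user count drops to zero during the walk below,
+  // every instruction its speculation would clone has a memoized cost, so
+  // that is the point at which its profitability can be decided.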
+ SmallDenseMap, 16> UserToPNMap; + SmallDenseMap PNUserCountMap; + SmallPtrSet UserSet; + for (auto *PN : PNs) { + assert(UserSet.empty() && "Must start with an empty user set!"); + for (Use &U : PN->uses()) + UserSet.insert(cast(U.getUser())); + PNUserCountMap[PN] = UserSet.size(); + for (auto *UI : UserSet) + UserToPNMap.insert({UI, {}}).first->second.push_back(PN); + UserSet.clear(); + } + + // Now do a DFS across the operand graph of the users, computing cost as we + // go and when all costs for a given PHI are known, checking that PHI for + // profitability. + SmallDenseMap SpecCostMap; + visitPHIUsersAndDepsInPostOrder( + PNs, + /*IsVisited*/ + [&](Instruction *I) { + // We consider anything that isn't potentially speculated to be + // "visited" as it is already handled. Similarly, anything that *is* + // potentially speculated but for which we have an entry in our cost + // map, we're done. + return !PotentialSpecSet.count(I) || SpecCostMap.count(I); + }, + /*Visit*/ + [&](Instruction *I) { + // We've fully visited the operands, so sum their cost with this node + // and update the cost map. + int Cost = TTI.TCC_Free; + for (Value *OpV : I->operand_values()) + if (auto *OpI = dyn_cast(OpV)) { + auto CostMapIt = SpecCostMap.find(OpI); + if (CostMapIt != SpecCostMap.end()) + Cost += CostMapIt->second; + } + Cost += TTI.getUserCost(I); + bool Inserted = SpecCostMap.insert({I, Cost}).second; + (void)Inserted; + assert(Inserted && "Must not re-insert a cost during the DFS!"); + + // Now check if this node had a corresponding PHI node using it. If so, + // we need to decrement the outstanding user count for it. + auto UserPNsIt = UserToPNMap.find(I); + if (UserPNsIt == UserToPNMap.end()) + return; + auto &UserPNs = UserPNsIt->second; + auto UserPNsSplitIt = std::stable_partition( + UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) { + int &PNUserCount = PNUserCountMap.find(UserPN)->second; + assert( + PNUserCount > 0 && + "Should never re-visit a PN after its user count hits zero!"); + --PNUserCount; + return PNUserCount != 0; + }); + + // FIXME: Rather than one at a time, we should sum the savings as the + // cost will be completely shared. + SmallVector SpecWorklist; + for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) { + int SpecCost = TTI.TCC_Free; + for (Use &U : PN->uses()) + SpecCost += + SpecCostMap.find(cast(U.getUser()))->second; + SpecCost *= (NumPreds - 1); + // When the user count of a PHI node hits zero, we should check its + // profitability. If profitable, we should mark it for speculation + // and zero out the cost of everything it depends on. + int CostSavings = CostSavingsMap.find(PN)->second; + if (SpecCost > CostSavings) { + DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN << "\n" + " Cost savings: " << CostSavings << "\n" + " Speculation cost: " << SpecCost << "\n"); + continue; + } + + // We're going to speculate this user-associated PHI. Copy it out and + // add its users to the worklist to update their cost. + SpecPNs.push_back(PN); + for (Use &U : PN->uses()) { + auto *UI = cast(U.getUser()); + auto CostMapIt = SpecCostMap.find(UI); + if (CostMapIt->second == 0) + continue; + // Zero out this cost entry to avoid duplicates. + CostMapIt->second = 0; + SpecWorklist.push_back(UI); + } + } + + // Now walk all the operands of the users in the worklist transitively + // to zero out all the memoized costs. 
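+        // Zeroing these entries is what lets later PHIs share the cost of
+        // instructions that an earlier PHI has already committed to
+        // speculating.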
+ while (!SpecWorklist.empty()) { + Instruction *SpecI = SpecWorklist.pop_back_val(); + assert(SpecCostMap.find(SpecI)->second == 0 && + "Didn't zero out a cost!"); + + // Walk the operands recursively to zero out their cost as well. + for (auto *OpV : SpecI->operand_values()) { + auto *OpI = dyn_cast(OpV); + if (!OpI) + continue; + auto CostMapIt = SpecCostMap.find(OpI); + if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0) + continue; + CostMapIt->second = 0; + SpecWorklist.push_back(OpI); + } + } + }); + + return SpecPNs; +} + +/// Speculate users around a set of PHI nodes. +/// +/// This routine does the actual speculation around a set of PHI nodes where we +/// have determined this to be both safe and profitable. +/// +/// This routine handles any spliting of critical edges necessary to create +/// a safe block to speculate into as well as cloning the instructions and +/// rewriting all uses. +static void speculatePHIs(ArrayRef SpecPNs, + SmallPtrSetImpl &PotentialSpecSet, + SmallSetVector &PredSet, + DominatorTree &DT) { + DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n"); + NumPHIsSpeculated += SpecPNs.size(); + + // Split any critical edges so that we have a block to hoist into. + auto *ParentBB = SpecPNs[0]->getParent(); + SmallVector SpecPreds; + SpecPreds.reserve(PredSet.size()); + for (auto *PredBB : PredSet) { + auto *NewPredBB = SplitCriticalEdge( + PredBB, ParentBB, + CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges()); + if (NewPredBB) { + ++NumEdgesSplit; + DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName() + << "\n"); + SpecPreds.push_back(NewPredBB); + } else { + assert(PredBB->getSingleSuccessor() == ParentBB && + "We need a non-critical predecessor to speculate into."); + assert(!isa(PredBB->getTerminator()) && + "Cannot have a non-critical invoke!"); + + // Already non-critical, use existing pred. + SpecPreds.push_back(PredBB); + } + } + + SmallPtrSet SpecSet; + SmallVector SpecList; + visitPHIUsersAndDepsInPostOrder(SpecPNs, + /*IsVisited*/ + [&](Instruction *I) { + // This is visited if we don't need to + // speculate it or we already have + // speculated it. + return !PotentialSpecSet.count(I) || + SpecSet.count(I); + }, + /*Visit*/ + [&](Instruction *I) { + // All operands scheduled, schedule this + // node. + SpecSet.insert(I); + SpecList.push_back(I); + }); + + int NumSpecInsts = SpecList.size() * SpecPreds.size(); + int NumRedundantInsts = NumSpecInsts - SpecList.size(); + DEBUG(dbgs() << " Inserting " << NumSpecInsts << " speculated instructions, " + << NumRedundantInsts << " redundancies\n"); + NumSpeculatedInstructions += NumSpecInsts; + NumNewRedundantInstructions += NumRedundantInsts; + + // Each predecessor is numbered by its index in `SpecPreds`, so for each + // instruction we speculate, the speculated instruction is stored in that + // index of the vector asosciated with the original instruction. We also + // store the incoming values for each predecessor from any PHIs used. + SmallDenseMap, 16> SpeculatedValueMap; + + // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming + // value. This handles both the PHIs we are speculating around and any other + // PHIs that happen to be used. 
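+  // Once populated below, `SpeculatedValueMap[I][i]` names the value to use
+  // in place of `I` when cloning into `SpecPreds[i]`; the cloned instructions
+  // themselves are appended to the same map as they are created.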
+ for (auto *OrigI : SpecList) + for (auto *OpV : OrigI->operand_values()) { + auto *OpPN = dyn_cast(OpV); + if (!OpPN || OpPN->getParent() != ParentBB) + continue; + + auto InsertResult = SpeculatedValueMap.insert({OpPN, {}}); + if (!InsertResult.second) + continue; + + auto &SpeculatedVals = InsertResult.first->second; + + // Populating our structure for mapping is particularly annoying because + // finding an incoming value for a particular predecessor block in a PHI + // node is a linear time operation! To avoid quadratic behavior, we build + // a map for this PHI node's incoming values and then translate it into + // the more compact representation used below. + SmallDenseMap IncomingValueMap; + for (int i : llvm::seq(0, OpPN->getNumIncomingValues())) + IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i); + + for (auto *PredBB : SpecPreds) + SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second); + } + + // Speculate into each predecessor. + for (int PredIdx : llvm::seq(0, SpecPreds.size())) { + auto *PredBB = SpecPreds[PredIdx]; + assert(PredBB->getSingleSuccessor() == ParentBB && + "We need a non-critical predecessor to speculate into."); + + for (auto *OrigI : SpecList) { + auto *NewI = OrigI->clone(); + NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx)); + NewI->insertBefore(PredBB->getTerminator()); + + // Rewrite all the operands to the previously speculated instructions. + // Because we're walking in-order, the defs must precede the uses and we + // should already have these mappings. + for (Use &U : NewI->operands()) { + auto *OpI = dyn_cast(U.get()); + if (!OpI) + continue; + auto MapIt = SpeculatedValueMap.find(OpI); + if (MapIt == SpeculatedValueMap.end()) + continue; + const auto &SpeculatedVals = MapIt->second; + assert(SpeculatedVals[PredIdx] && + "Must have a speculated value for this predecessor!"); + assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() && + "Speculated value has the wrong type!"); + + // Rewrite the use to this predecessor's speculated instruction. + U.set(SpeculatedVals[PredIdx]); + } + + // Commute instructions which now have a constant in the LHS but not the + // RHS. + if (NewI->isBinaryOp() && NewI->isCommutative() && + isa(NewI->getOperand(0)) && + !isa(NewI->getOperand(1))) + NewI->getOperandUse(0).swap(NewI->getOperandUse(1)); + + SpeculatedValueMap[OrigI].push_back(NewI); + assert(SpeculatedValueMap[OrigI][PredIdx] == NewI && + "Mismatched speculated instruction index!"); + } + } + + // Walk the speculated instruction list and if they have uses, insert a PHI + // for them from the speculated versions, and replace the uses with the PHI. + // Then erase the instructions as they have been fully speculated. The walk + // needs to be in reverse so that we don't think there are users when we'll + // actually eventually remove them later. + IRBuilder<> IRB(SpecPNs[0]); + for (auto *OrigI : llvm::reverse(SpecList)) { + // Check if we need a PHI for any remaining users and if so, insert it. + if (!OrigI->use_empty()) { + auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(), + Twine(OrigI->getName()) + ".phi"); + // Add the incoming values we speculated. + auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second; + for (int PredIdx : llvm::seq(0, SpecPreds.size())) + SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]); + + // And replace the uses with the PHI node. 
+ OrigI->replaceAllUsesWith(SpecIPN); + } + + // It is important to immediately erase this so that it stops using other + // instructions. This avoids inserting needless PHIs of them. + OrigI->eraseFromParent(); + } + + // All of the uses of the speculated phi nodes should be removed at this + // point, so erase them. + for (auto *SpecPN : SpecPNs) { + assert(SpecPN->use_empty() && "All users should have been speculated!"); + SpecPN->eraseFromParent(); + } +} + +/// Try to speculate around a series of PHIs from a single basic block. +/// +/// This routine checks whether any of these PHIs are profitable to speculate +/// users around. If safe and profitable, it does the speculation. It returns +/// true when at least some speculation occurs. +static bool tryToSpeculatePHIs(SmallVectorImpl &PNs, + DominatorTree &DT, TargetTransformInfo &TTI) { + DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n"); + + // Savings in cost from speculating around a PHI node. + SmallDenseMap CostSavingsMap; + + // Remember the set of instructions that are candidates for speculation so + // that we can quickly walk things within that space. This prunes out + // instructions already available along edges, etc. + SmallPtrSet PotentialSpecSet; + + // Remember the set of instructions that are (transitively) unsafe to + // speculate into the incoming edges of this basic block. This avoids + // recomputing them for each PHI node we check. This set is specific to this + // block though as things are pruned out of it based on what is available + // along incoming edges. + SmallPtrSet UnsafeSet; + + // For each PHI node in this block, check whether there are immediate folding + // opportunities from speculation, and whether that speculation will be + // valid. This determise the set of safe PHIs to speculate. + PNs.erase(llvm::remove_if(PNs, + [&](PHINode *PN) { + return !isSafeAndProfitableToSpeculateAroundPHI( + *PN, CostSavingsMap, PotentialSpecSet, + UnsafeSet, DT, TTI); + }), + PNs.end()); + // If no PHIs were profitable, skip. + if (PNs.empty()) { + DEBUG(dbgs() << " No safe and profitable PHIs found!\n"); + return false; + } + + // We need to know how much speculation will cost which is determined by how + // many incoming edges will need a copy of each speculated instruction. + SmallSetVector PredSet; + for (auto *PredBB : PNs[0]->blocks()) { + if (!PredSet.insert(PredBB)) + continue; + + // We cannot speculate when a predecessor is an indirect branch. + // FIXME: We also can't reliably create a non-critical edge block for + // speculation if the predecessor is an invoke. This doesn't seem + // fundamental and we should probably be splitting critical edges + // differently. + if (isa(PredBB->getTerminator()) || + isa(PredBB->getTerminator())) { + DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName() + << "\n"); + return false; + } + } + if (PredSet.size() < 2) { + DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n"); + return false; + } + + SmallVector SpecPNs = findProfitablePHIs( + PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI); + if (SpecPNs.empty()) + // Nothing to do. 
+ return false; + + speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT); + return true; +} + +PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &DT = AM.getResult(F); + auto &TTI = AM.getResult(F); + + bool Changed = false; + for (auto *BB : ReversePostOrderTraversal(&F)) { + SmallVector PNs; + auto BBI = BB->begin(); + while (auto *PN = dyn_cast(&*BBI)) { + PNs.push_back(PN); + ++BBI; + } + + if (PNs.empty()) + continue; + + Changed |= tryToSpeculatePHIs(PNs, DT, TTI); + } + + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + return PA; +} Index: llvm/trunk/test/Other/new-pm-defaults.ll =================================================================== --- llvm/trunk/test/Other/new-pm-defaults.ll +++ llvm/trunk/test/Other/new-pm-defaults.ll @@ -210,6 +210,7 @@ ; CHECK-O-NEXT: Running pass: InstSimplifierPass ; CHECK-O-NEXT: Running pass: DivRemPairsPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-O-NEXT: Running pass: SpeculateAroundPHIsPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: GlobalDCEPass ; CHECK-O-NEXT: Running pass: ConstantMergePass Index: llvm/trunk/test/Other/new-pm-thinlto-defaults.ll =================================================================== --- llvm/trunk/test/Other/new-pm-thinlto-defaults.ll +++ llvm/trunk/test/Other/new-pm-thinlto-defaults.ll @@ -198,6 +198,7 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifierPass ; CHECK-POSTLINK-O-NEXT: Running pass: DivRemPairsPass ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-POSTLINK-O-NEXT: Running pass: SpeculateAroundPHIsPass ; CHECK-POSTLINK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-POSTLINK-O-NEXT: Running pass: GlobalDCEPass ; CHECK-POSTLINK-O-NEXT: Running pass: ConstantMergePass Index: llvm/trunk/test/Transforms/SpeculateAroundPHIs/basic-x86.ll =================================================================== --- llvm/trunk/test/Transforms/SpeculateAroundPHIs/basic-x86.ll +++ llvm/trunk/test/Transforms/SpeculateAroundPHIs/basic-x86.ll @@ -0,0 +1,595 @@ +; Test the basic functionality of speculating around PHI nodes based on reduced +; cost of the constant operands to the PHI nodes using the x86 cost model. +; +; REQUIRES: x86-registered-target +; RUN: opt -S -passes=spec-phis < %s | FileCheck %s + +target triple = "x86_64-unknown-unknown" + +define i32 @test_basic(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_basic( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg, 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %arg, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +; Check that we handle commuted operands and get the constant onto the RHS. 
+define i32 @test_commuted(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_commuted( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg, 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %p, %arg + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +define i32 @test_split_crit_edge(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_split_crit_edge( +entry: + br i1 %flag, label %exit, label %a +; CHECK: entry: +; CHECK-NEXT: br i1 %flag, label %[[ENTRY_SPLIT:.*]], label %a +; +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: %[[SUM_ENTRY_SPLIT:.*]] = add i32 %arg, 7 +; CHECK-NEXT: br label %exit + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %entry ], [ 11, %a ] + %sum = add i32 %arg, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_ENTRY_SPLIT]], %[[ENTRY_SPLIT]] ], [ %[[SUM_A]], %a ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +define i32 @test_no_spec_dominating_inst(i1 %flag, i32* %ptr) { +; CHECK-LABEL: define i32 @test_no_spec_dominating_inst( +entry: + %load = load i32, i32* %ptr + br i1 %flag, label %a, label %b +; CHECK: %[[LOAD:.*]] = load i32, i32* %ptr +; CHECK-NEXT: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %[[LOAD]], 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %[[LOAD]], 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %load, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +; We have special logic handling PHI nodes, make sure it doesn't get confused +; by a dominating PHI. +define i32 @test_no_spec_dominating_phi(i1 %flag1, i1 %flag2, i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @test_no_spec_dominating_phi( +entry: + br i1 %flag1, label %x.block, label %y.block +; CHECK: entry: +; CHECK-NEXT: br i1 %flag1, label %x.block, label %y.block + +x.block: + br label %merge +; CHECK: x.block: +; CHECK-NEXT: br label %merge + +y.block: + br label %merge +; CHECK: y.block: +; CHECK-NEXT: br label %merge + +merge: + %xy.phi = phi i32 [ %x, %x.block ], [ %y, %y.block ] + br i1 %flag2, label %a, label %b +; CHECK: merge: +; CHECK-NEXT: %[[XY_PHI:.*]] = phi i32 [ %x, %x.block ], [ %y, %y.block ] +; CHECK-NEXT: br i1 %flag2, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %[[XY_PHI]], 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %[[XY_PHI]], 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %xy.phi, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[SUM_PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[SUM_PHI]] +} + +; Ensure that we will speculate some number of "free" instructions on the given +; architecture even though they are unrelated to the PHI itself. 
+define i32 @test_speculate_free_insts(i1 %flag, i64 %arg) { +; CHECK-LABEL: define i32 @test_speculate_free_insts( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[T1_A:.*]] = trunc i64 %arg to i48 +; CHECK-NEXT: %[[T2_A:.*]] = trunc i48 %[[T1_A]] to i32 +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %[[T2_A]], 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[T1_B:.*]] = trunc i64 %arg to i48 +; CHECK-NEXT: %[[T2_B:.*]] = trunc i48 %[[T1_B]] to i32 +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %[[T2_B]], 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %t1 = trunc i64 %arg to i48 + %t2 = trunc i48 %t1 to i32 + %sum = add i32 %t2, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +define i32 @test_speculate_free_phis(i1 %flag, i32 %arg1, i32 %arg2) { +; CHECK-LABEL: define i32 @test_speculate_free_phis( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg1, 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg2, 11 +; CHECK-NEXT: br label %exit + +exit: + %p1 = phi i32 [ 7, %a ], [ 11, %b ] + %p2 = phi i32 [ %arg1, %a ], [ %arg2, %b ] + %sum = add i32 %p2, %p1 + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; We don't DCE the now unused PHI node... +; CHECK-NEXT: %{{.*}} = phi i32 [ %arg1, %a ], [ %arg2, %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +; We shouldn't speculate multiple uses even if each individually looks +; profitable because of the total cost. 
+define i32 @test_no_spec_multi_uses(i1 %flag, i32 %arg1, i32 %arg2, i32 %arg3) { +; CHECK-LABEL: define i32 @test_no_spec_multi_uses( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %add1 = add i32 %arg1, %p + %add2 = add i32 %arg2, %p + %add3 = add i32 %arg3, %p + %sum1 = add i32 %add1, %add2 + %sum2 = add i32 %sum1, %add3 + ret i32 %sum2 +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ] +; CHECK-NEXT: %[[ADD1:.*]] = add i32 %arg1, %[[PHI]] +; CHECK-NEXT: %[[ADD2:.*]] = add i32 %arg2, %[[PHI]] +; CHECK-NEXT: %[[ADD3:.*]] = add i32 %arg3, %[[PHI]] +; CHECK-NEXT: %[[SUM1:.*]] = add i32 %[[ADD1]], %[[ADD2]] +; CHECK-NEXT: %[[SUM2:.*]] = add i32 %[[SUM1]], %[[ADD3]] +; CHECK-NEXT: ret i32 %[[SUM2]] +} + +define i32 @test_multi_phis1(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_multi_phis1( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A1:.*]] = add i32 %arg, 1 +; CHECK-NEXT: %[[SUM_A2:.*]] = add i32 %[[SUM_A1]], 3 +; CHECK-NEXT: %[[SUM_A3:.*]] = add i32 %[[SUM_A2]], 5 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B1:.*]] = add i32 %arg, 2 +; CHECK-NEXT: %[[SUM_B2:.*]] = add i32 %[[SUM_B1]], 4 +; CHECK-NEXT: %[[SUM_B3:.*]] = add i32 %[[SUM_B2]], 6 +; CHECK-NEXT: br label %exit + +exit: + %p1 = phi i32 [ 1, %a ], [ 2, %b ] + %p2 = phi i32 [ 3, %a ], [ 4, %b ] + %p3 = phi i32 [ 5, %a ], [ 6, %b ] + %sum1 = add i32 %arg, %p1 + %sum2 = add i32 %sum1, %p2 + %sum3 = add i32 %sum2, %p3 + ret i32 %sum3 +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A3]], %a ], [ %[[SUM_B3]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +; Check that the order of the PHIs doesn't impact the behavior. 
+define i32 @test_multi_phis2(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_multi_phis2( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A1:.*]] = add i32 %arg, 1 +; CHECK-NEXT: %[[SUM_A2:.*]] = add i32 %[[SUM_A1]], 3 +; CHECK-NEXT: %[[SUM_A3:.*]] = add i32 %[[SUM_A2]], 5 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B1:.*]] = add i32 %arg, 2 +; CHECK-NEXT: %[[SUM_B2:.*]] = add i32 %[[SUM_B1]], 4 +; CHECK-NEXT: %[[SUM_B3:.*]] = add i32 %[[SUM_B2]], 6 +; CHECK-NEXT: br label %exit + +exit: + %p3 = phi i32 [ 5, %a ], [ 6, %b ] + %p2 = phi i32 [ 3, %a ], [ 4, %b ] + %p1 = phi i32 [ 1, %a ], [ 2, %b ] + %sum1 = add i32 %arg, %p1 + %sum2 = add i32 %sum1, %p2 + %sum3 = add i32 %sum2, %p3 + ret i32 %sum3 +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A3]], %a ], [ %[[SUM_B3]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +define i32 @test_no_spec_indirectbr(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_no_spec_indirectbr( +entry: + br i1 %flag, label %a, label %b +; CHECK: entry: +; CHECK-NEXT: br i1 %flag, label %a, label %b + +a: + indirectbr i8* undef, [label %exit] +; CHECK: a: +; CHECK-NEXT: indirectbr i8* undef, [label %exit] + +b: + indirectbr i8* undef, [label %exit] +; CHECK: b: +; CHECK-NEXT: indirectbr i8* undef, [label %exit] + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %arg, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ] +; CHECK-NEXT: %[[SUM:.*]] = add i32 %arg, %[[PHI]] +; CHECK-NEXT: ret i32 %[[SUM]] +} + +declare void @g() + +declare i32 @__gxx_personality_v0(...) + +; FIXME: We should be able to handle this case -- only the exceptional edge is +; impossible to split. 
+define i32 @test_no_spec_invoke_continue(i1 %flag, i32 %arg) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: define i32 @test_no_spec_invoke_continue( +entry: + br i1 %flag, label %a, label %b +; CHECK: entry: +; CHECK-NEXT: br i1 %flag, label %a, label %b + +a: + invoke void @g() + to label %exit unwind label %lpad +; CHECK: a: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %exit unwind label %lpad + +b: + invoke void @g() + to label %exit unwind label %lpad +; CHECK: b: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %exit unwind label %lpad + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %arg, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ] +; CHECK-NEXT: %[[SUM:.*]] = add i32 %arg, %[[PHI]] +; CHECK-NEXT: ret i32 %[[SUM]] + +lpad: + %lp = landingpad { i8*, i32 } + cleanup + resume { i8*, i32 } undef +} + +define i32 @test_no_spec_landingpad(i32 %arg, i32* %ptr) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: define i32 @test_no_spec_landingpad( +entry: + invoke void @g() + to label %invoke.cont unwind label %lpad +; CHECK: entry: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %invoke.cont unwind label %lpad + +invoke.cont: + invoke void @g() + to label %exit unwind label %lpad +; CHECK: invoke.cont: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %exit unwind label %lpad + +lpad: + %p = phi i32 [ 7, %entry ], [ 11, %invoke.cont ] + %lp = landingpad { i8*, i32 } + cleanup + %sum = add i32 %arg, %p + store i32 %sum, i32* %ptr + resume { i8*, i32 } undef +; CHECK: lpad: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %entry ], [ 11, %invoke.cont ] + +exit: + ret i32 0 +} + +declare i32 @__CxxFrameHandler3(...) + +define i32 @test_no_spec_cleanuppad(i32 %arg, i32* %ptr) personality i32 (...)* @__CxxFrameHandler3 { +; CHECK-LABEL: define i32 @test_no_spec_cleanuppad( +entry: + invoke void @g() + to label %invoke.cont unwind label %lpad +; CHECK: entry: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %invoke.cont unwind label %lpad + +invoke.cont: + invoke void @g() + to label %exit unwind label %lpad +; CHECK: invoke.cont: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %exit unwind label %lpad + +lpad: + %p = phi i32 [ 7, %entry ], [ 11, %invoke.cont ] + %cp = cleanuppad within none [] + %sum = add i32 %arg, %p + store i32 %sum, i32* %ptr + cleanupret from %cp unwind to caller +; CHECK: lpad: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %entry ], [ 11, %invoke.cont ] + +exit: + ret i32 0 +} + +; Check that we don't fall over when confronted with seemingly reasonable code +; for us to handle but in an unreachable region and with non-PHI use-def +; cycles. 
+define i32 @test_unreachable_non_phi_cycles(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_unreachable_non_phi_cycles( +entry: + ret i32 42 +; CHECK: entry: +; CHECK-NEXT: ret i32 42 + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %zext = zext i32 %sum to i64 + %trunc = trunc i64 %zext to i32 + %sum = add i32 %trunc, %p + br i1 %flag, label %a, label %b +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ] +; CHECK-NEXT: %[[ZEXT:.*]] = zext i32 %[[SUM:.*]] to i64 +; CHECK-NEXT: %[[TRUNC:.*]] = trunc i64 %[[ZEXT]] to i32 +; CHECK-NEXT: %[[SUM]] = add i32 %[[TRUNC]], %[[PHI]] +; CHECK-NEXT: br i1 %flag, label %a, label %b +} + +; Check that we don't speculate in the face of an expensive immediate. There +; are two reasons this should never speculate. First, even a local analysis +; should fail because it makes some paths (%a) potentially more expensive due +; to multiple uses of the immediate. Additionally, when we go to speculate the +; instructions, their cost will also be too high. +; FIXME: The goal is really to test the first property, but there doesn't +; happen to be any way to use free-to-speculate instructions here so that it +; would be the only interesting property. +define i64 @test_expensive_imm(i32 %flag, i64 %arg) { +; CHECK-LABEL: define i64 @test_expensive_imm( +entry: + switch i32 %flag, label %a [ + i32 1, label %b + i32 2, label %c + i32 3, label %d + ] +; CHECK: switch i32 %flag, label %a [ +; CHECK-NEXT: i32 1, label %b +; CHECK-NEXT: i32 2, label %c +; CHECK-NEXT: i32 3, label %d +; CHECK-NEXT: ] + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: br label %exit + +c: + br label %exit +; CHECK: c: +; CHECK-NEXT: br label %exit + +d: + br label %exit +; CHECK: d: +; CHECK-NEXT: br label %exit + +exit: + %p = phi i64 [ 4294967296, %a ], [ 1, %b ], [ 1, %c ], [ 1, %d ] + %sum1 = add i64 %arg, %p + %sum2 = add i64 %sum1, %p + ret i64 %sum2 +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i64 [ {{[0-9]+}}, %a ], [ 1, %b ], [ 1, %c ], [ 1, %d ] +; CHECK-NEXT: %[[SUM1:.*]] = add i64 %arg, %[[PHI]] +; CHECK-NEXT: %[[SUM2:.*]] = add i64 %[[SUM1]], %[[PHI]] +; CHECK-NEXT: ret i64 %[[SUM2]] +} + +define i32 @test_no_spec_non_postdominating_uses(i1 %flag1, i1 %flag2, i32 %arg) { +; CHECK-LABEL: define i32 @test_no_spec_non_postdominating_uses( +entry: + br i1 %flag1, label %a, label %b +; CHECK: br i1 %flag1, label %a, label %b + +a: + br label %merge +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 7 +; CHECK-NEXT: br label %merge + +b: + br label %merge +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg, 11 +; CHECK-NEXT: br label %merge + +merge: + %p1 = phi i32 [ 7, %a ], [ 11, %b ] + %p2 = phi i32 [ 13, %a ], [ 42, %b ] + %sum1 = add i32 %arg, %p1 + br i1 %flag2, label %exit1, label %exit2 +; CHECK: merge: +; CHECK-NEXT: %[[PHI1:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: %[[PHI2:.*]] = phi i32 [ 13, %a ], [ 42, %b ] +; CHECK-NEXT: br i1 %flag2, label %exit1, label %exit2 + +exit1: + ret i32 %sum1 +; CHECK: exit1: +; CHECK-NEXT: ret i32 %[[PHI1]] + +exit2: + %sum2 = add i32 %arg, %p2 + ret i32 %sum2 +; CHECK: exit2: +; CHECK-NEXT: %[[SUM2:.*]] = add i32 %arg, %[[PHI2]] +; CHECK-NEXT: ret i32 %[[SUM2]] +}