diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -143,7 +143,7 @@ /// recursively. Return true if any instructions were deleted. bool RecursivelyDeleteTriviallyDeadInstructions( Value *V, const TargetLibraryInfo *TLI = nullptr, - MemorySSAUpdater *MSSAU = nullptr); + MemorySSAUpdater *MSSAU = nullptr, AssumptionCache *AC = nullptr); /// Delete all of the instructions in `DeadInsts`, and all other instructions /// that deleting these in turn causes to be trivially dead. @@ -155,7 +155,8 @@ /// empty afterward. void RecursivelyDeleteTriviallyDeadInstructions( SmallVectorImpl &DeadInsts, - const TargetLibraryInfo *TLI = nullptr, MemorySSAUpdater *MSSAU = nullptr); + const TargetLibraryInfo *TLI = nullptr, MemorySSAUpdater *MSSAU = nullptr, + AssumptionCache *AC = nullptr); /// Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow /// instructions that are not trivially dead. These will be ignored. @@ -163,7 +164,8 @@ /// were found and deleted. bool RecursivelyDeleteTriviallyDeadInstructionsPermissive( SmallVectorImpl &DeadInsts, - const TargetLibraryInfo *TLI = nullptr, MemorySSAUpdater *MSSAU = nullptr); + const TargetLibraryInfo *TLI = nullptr, MemorySSAUpdater *MSSAU = nullptr, + AssumptionCache *AC = nullptr); /// If the specified value is an effectively dead PHI node, due to being a /// def-use chain of single-use nodes that either forms a cycle or is terminated @@ -242,7 +244,8 @@ /// branches to us and one of our successors, fold the setcc into the /// predecessor and use logical operations to pick the right destination. 
bool FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU = nullptr, - unsigned BonusInstThreshold = 1); + unsigned BonusInstThreshold = 1, + AssumptionCache *AC = nullptr); /// This function takes a virtual register computed by an Instruction and /// replaces it with a slot in the stack frame, allocated via alloca. diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -7987,7 +7987,7 @@ << ToBeDeletedInsts.size() << " instructions and " << ToBeChangedUses.size() << " uses\n"); - SmallVector DeadInsts; + SmallVector DeadInsts; SmallVector TerminatorsToFold; for (auto &It : ToBeChangedUses) { @@ -8072,7 +8072,11 @@ } } - RecursivelyDeleteTriviallyDeadInstructions(DeadInsts); + for (Instruction *I : DeadInsts) { + AssumptionCache *AC = + InfoCache.AG.getAnalysis(*I->getFunction()); + RecursivelyDeleteTriviallyDeadInstructions(I, nullptr, nullptr, AC); + } if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) { SmallVector ToBeDeletedBBs; diff --git a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp --- a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -66,7 +66,8 @@ } } } - RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI); + RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI, nullptr, + SQ.AC); } // Place the list of instructions to simplify on the next loop iteration diff --git a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp --- a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -152,7 +152,7 @@ // iteration over all instructions in all the loop blocks. 
if (!DeadInsts.empty()) { Changed = true; - RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI, MSSAU); + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI, MSSAU, &AC); } if (MSSAU && VerifyMemorySSA) diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp --- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -243,7 +243,7 @@ WeakVH NewIExist = NewI; // If SeenExprs/NewIExist contains I's WeakTrackingVH/WeakVH, that // entry will be replaced with nullptr if deleted. - RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI); + RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI, nullptr, AC); if (!NewIExist) { // Rare occation where the new instruction (NewI) have been removed, // probably due to parts of the input code was dead from the diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -134,8 +134,11 @@ auto addAttrList = [&](AttributeList AttrList) { for (unsigned Idx = AttributeList::FirstArgIndex; Idx < AttrList.getNumAttrSets(); Idx++) - for (Attribute Attr : AttrList.getAttributes(Idx)) + for (Attribute Attr : AttrList.getAttributes(Idx)) { + if (!Call->getArgOperand(Idx - 1)) + continue; addAttribute(Attr, Call->getArgOperand(Idx - 1)); + } for (Attribute Attr : AttrList.getFnAttributes()) addAttribute(Attr, nullptr); }; @@ -198,9 +201,18 @@ addAttr(Attribute::Alignment, Pointer, MA.valueOrOne().value()); } + bool instructionHasNullOperand(Instruction *I) { + for (Value *V : I->operand_values()) + if (!V) + return true; + return false; + } + void addInstruction(Instruction *I) { if (auto *Call = dyn_cast(I)) return addCall(Call); + if (instructionHasNullOperand(I)) + return; if (auto *Load = dyn_cast(I)) return addAccessedPtr(I, Load->getPointerOperand(), 
Load->getType(), Load->getAlign()); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -78,6 +78,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include #include #include @@ -441,21 +442,22 @@ /// trivially dead, delete them too, recursively. Return true if any /// instructions were deleted. bool llvm::RecursivelyDeleteTriviallyDeadInstructions( - Value *V, const TargetLibraryInfo *TLI, MemorySSAUpdater *MSSAU) { + Value *V, const TargetLibraryInfo *TLI, MemorySSAUpdater *MSSAU, + AssumptionCache *AC) { Instruction *I = dyn_cast(V); if (!I || !isInstructionTriviallyDead(I, TLI)) return false; SmallVector DeadInsts; DeadInsts.push_back(I); - RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU); + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU, AC); return true; } bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive( SmallVectorImpl &DeadInsts, const TargetLibraryInfo *TLI, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, AssumptionCache *AC) { unsigned S = 0, E = DeadInsts.size(), Alive = 0; for (; S != E; ++S) { auto *I = cast(DeadInsts[S]); @@ -466,13 +468,13 @@ } if (Alive == E) return false; - RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU); + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU, AC); return true; } void llvm::RecursivelyDeleteTriviallyDeadInstructions( SmallVectorImpl &DeadInsts, const TargetLibraryInfo *TLI, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, AssumptionCache *AC) { // Process the dead instruction list until empty. while (!DeadInsts.empty()) { Value *V = DeadInsts.pop_back_val(); @@ -485,6 +487,7 @@ // Don't lose the debug info while deleting the instructions. 
salvageDebugInfo(*I); + salvageKnowledge(I, AC); // Null out all of the instruction's operands to see if any operand becomes // dead as we go. @@ -571,6 +574,7 @@ const TargetLibraryInfo *TLI) { if (isInstructionTriviallyDead(I, TLI)) { salvageDebugInfo(*I); + salvageKnowledge(I); // Null out all of the instruction's operands to see if any operand becomes // dead as we go. diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -215,7 +215,7 @@ while (!DeadInsts.empty()) { Value *V = DeadInsts.pop_back_val(); if (Instruction *Inst = dyn_cast_or_null(V)) - RecursivelyDeleteTriviallyDeadInstructions(Inst); + RecursivelyDeleteTriviallyDeadInstructions(Inst, nullptr, nullptr, AC); } } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -203,10 +203,23 @@ bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder); bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder); bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool SimplifyCondBranchToTwoReturns(BranchInst *BI, IRBuilder<> &Builder); bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, IRBuilder<> &Builder); + bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI); + bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, + const TargetTransformInfo &TTI); + bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, + BasicBlock *TrueBB, BasicBlock *FalseBB, + uint32_t TrueWeight, uint32_t FalseWeight); + bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, + const DataLayout &DL); + bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select); + bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI); + bool 
TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder); + public: SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL, SmallPtrSetImpl *LoopHeaders, @@ -677,10 +690,8 @@ } }; -} // end anonymous namespace - -static void EraseTerminatorAndDCECond(Instruction *TI, - MemorySSAUpdater *MSSAU = nullptr) { +void EraseTerminatorAndDCECond(Instruction *TI, MemorySSAUpdater *MSSAU, + AssumptionCache *AC) { Instruction *Cond = nullptr; if (SwitchInst *SI = dyn_cast(TI)) { Cond = dyn_cast(SI->getCondition()); @@ -864,7 +875,7 @@ << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n"); - EraseTerminatorAndDCECond(TI); + EraseTerminatorAndDCECond(TI, nullptr, Options.AC); return true; } @@ -929,12 +940,10 @@ << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n"); - EraseTerminatorAndDCECond(TI); + EraseTerminatorAndDCECond(TI, nullptr, Options.AC); return true; } -namespace { - /// This class implements a stable ordering of constant /// integers that does not depend on their address. This is important for /// applications that sort ConstantInt's to ensure uniqueness. @@ -1190,7 +1199,7 @@ setBranchWeights(NewSI, MDWeights); } - EraseTerminatorAndDCECond(PTI); + EraseTerminatorAndDCECond(PTI, nullptr, Options.AC); // Okay, last check. If BB is still a successor of PSI, then we must // have an infinite loop case. If so, add an infinitely looping block @@ -1236,8 +1245,8 @@ /// Given a conditional branch that goes to BB1 and BB2, hoist any common code /// in the two blocks up into the branch block. The caller of this function /// guarantees that BI's block dominates BB1 and BB2. -static bool HoistThenElseCodeToIf(BranchInst *BI, - const TargetTransformInfo &TTI) { +bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, + const TargetTransformInfo &TTI) { // This does very trivial matching, with limited scanning, to find identical // instructions in the two blocks. 
In particular, we don't want to get into // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As @@ -1427,7 +1436,7 @@ for (BasicBlock *Succ : successors(BB1)) AddPredecessorToBlock(Succ, BIParent, BB1); - EraseTerminatorAndDCECond(BI); + EraseTerminatorAndDCECond(BI, nullptr, Options.AC); return true; } @@ -1970,8 +1979,8 @@ /// \endcode /// /// \returns true if the conditional block is removed. -static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, - const TargetTransformInfo &TTI) { +bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, + const TargetTransformInfo &TTI) { // Be conservative for now. FP select instruction can often be expensive. Value *BrCond = BI->getCondition(); if (isa(BrCond)) @@ -2463,8 +2472,8 @@ /// If we found a conditional branch that goes to two returning blocks, /// try to merge them together into one return, /// introducing a select if the return values disagree. -static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, - IRBuilder<> &Builder) { +bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI, + IRBuilder<> &Builder) { assert(BI->isConditional() && "Must be a conditional branch"); BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); @@ -2487,7 +2496,7 @@ TrueSucc->removePredecessor(BI->getParent()); FalseSucc->removePredecessor(BI->getParent()); Builder.CreateRetVoid(); - EraseTerminatorAndDCECond(BI); + EraseTerminatorAndDCECond(BI, nullptr, Options.AC); return true; } @@ -2543,7 +2552,7 @@ << "\n " << *BI << "\nNewRet = " << *RI << "\nTRUEBLOCK: " << *TrueSucc << "\nFALSEBLOCK: " << *FalseSucc); - EraseTerminatorAndDCECond(BI); + EraseTerminatorAndDCECond(BI, nullptr, Options.AC); return true; } @@ -2588,495 +2597,188 @@ } } -/// If this basic block is simple enough, and if a predecessor branches to us -/// and one of our successors, fold the block into the predecessor and use -/// logical operations to pick the 
right destination. -bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, - unsigned BonusInstThreshold) { - BasicBlock *BB = BI->getParent(); +// If there is only one store in BB1 and BB2, return it, otherwise return +// nullptr. +static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) { + StoreInst *S = nullptr; + for (auto *BB : {BB1, BB2}) { + if (!BB) + continue; + for (auto &I : *BB) + if (auto *SI = dyn_cast(&I)) { + if (S) + // Multiple stores seen. + return nullptr; + else + S = SI; + } + } + return S; +} - const unsigned PredCount = pred_size(BB); +static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, + Value *AlternativeV = nullptr) { + // PHI is going to be a PHI node that allows the value V that is defined in + // BB to be referenced in BB's only successor. + // + // If AlternativeV is nullptr, the only value we care about in PHI is V. It + // doesn't matter to us what the other operand is (it'll never get used). We + // could just create a new PHI with an undef incoming value, but that could + // increase register pressure if EarlyCSE/InstCombine can't fold it with some + // other PHI. So here we directly look for some PHI in BB's successor with V + // as an incoming operand. If we find one, we use it, else we create a new + // one. + // + // If AlternativeV is not nullptr, we care about both incoming values in PHI. + // PHI must be exactly: phi [ %BB, %V ], [ %OtherBB, %AlternativeV] + // where OtherBB is the single other predecessor of BB's only successor. + PHINode *PHI = nullptr; + BasicBlock *Succ = BB->getSingleSuccessor(); - Instruction *Cond = nullptr; - if (BI->isConditional()) - Cond = dyn_cast(BI->getCondition()); - else { - // For unconditional branch, check for a simple CFG pattern, where - // BB has a single predecessor and BB's successor is also its predecessor's - // successor. If such pattern exists, check for CSE between BB and its - // predecessor. 
- if (BasicBlock *PB = BB->getSinglePredecessor()) - if (BranchInst *PBI = dyn_cast(PB->getTerminator())) - if (PBI->isConditional() && - (BI->getSuccessor(0) == PBI->getSuccessor(0) || - BI->getSuccessor(0) == PBI->getSuccessor(1))) { - for (auto I = BB->instructionsWithoutDebug().begin(), - E = BB->instructionsWithoutDebug().end(); - I != E;) { - Instruction *Curr = &*I++; - if (isa(Curr)) { - Cond = Curr; - break; - } - // Quit if we can't remove this instruction. - if (!tryCSEWithPredecessor(Curr, PB)) - return false; - } - } + for (auto I = Succ->begin(); isa(I); ++I) + if (cast(I)->getIncomingValueForBlock(BB) == V) { + PHI = cast(I); + if (!AlternativeV) + break; - if (!Cond) - return false; - } + assert(Succ->hasNPredecessors(2)); + auto PredI = pred_begin(Succ); + BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI; + if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV) + break; + PHI = nullptr; + } + if (PHI) + return PHI; - if (!Cond || (!isa(Cond) && !isa(Cond)) || - Cond->getParent() != BB || !Cond->hasOneUse()) - return false; + // If V is not an instruction defined in BB, just return it. + if (!AlternativeV && + (!isa(V) || cast(V)->getParent() != BB)) + return V; - // Make sure the instruction after the condition is the cond branch. - BasicBlock::iterator CondIt = ++Cond->getIterator(); + PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front()); + PHI->addIncoming(V, BB); + for (BasicBlock *PredBB : predecessors(Succ)) + if (PredBB != BB) + PHI->addIncoming( + AlternativeV ? AlternativeV : UndefValue::get(V->getType()), PredBB); + return PHI; +} - // Ignore dbg intrinsics. 
- while (isa(CondIt)) - ++CondIt; +static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, + BasicBlock *QTB, BasicBlock *QFB, + BasicBlock *PostBB, Value *Address, + bool InvertPCond, bool InvertQCond, + const DataLayout &DL, + const TargetTransformInfo &TTI) { + // For every pointer, there must be exactly two stores, one coming from + // PTB or PFB, and the other from QTB or QFB. We don't support more than one + // store (to any address) in PTB,PFB or QTB,QFB. + // FIXME: We could relax this restriction with a bit more work and performance + // testing. + StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB); + StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB); + if (!PStore || !QStore) + return false; - if (&*CondIt != BI) + // Now check the stores are compatible. + if (!QStore->isUnordered() || !PStore->isUnordered()) return false; - // Only allow this transformation if computing the condition doesn't involve - // too many instructions and these involved instructions can be executed - // unconditionally. We denote all involved instructions except the condition - // as "bonus instructions", and only allow this transformation when the - // number of the bonus instructions we'll need to create when cloning into - // each predecessor does not exceed a certain threshold. - unsigned NumBonusInsts = 0; - for (auto I = BB->begin(); Cond != &*I; ++I) { - // Ignore dbg intrinsics. - if (isa(I)) - continue; - if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I)) + // Check that sinking the store won't cause program behavior changes. Sinking + // the store out of the Q blocks won't change any behavior as we're sinking + // from a block to its unconditional successor. But we're moving a store from + // the P blocks down through the middle block (QBI) and past both QFB and QTB. + // So we need to check that there are no aliasing loads or stores in + // QBI, QTB and QFB. 
We also need to check there are no conflicting memory + // operations between PStore and the end of its parent block. + // + // The ideal way to do this is to query AliasAnalysis, but we don't + // preserve AA currently so that is dangerous. Be super safe and just + // check there are no other memory operations at all. + for (auto &I : *QFB->getSinglePredecessor()) + if (I.mayReadOrWriteMemory()) return false; - // I has only one use and can be executed unconditionally. - Instruction *User = dyn_cast(I->user_back()); - if (User == nullptr || User->getParent() != BB) + for (auto &I : *QFB) + if (&I != QStore && I.mayReadOrWriteMemory()) + return false; + if (QTB) + for (auto &I : *QTB) + if (&I != QStore && I.mayReadOrWriteMemory()) + return false; + for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end(); + I != E; ++I) + if (&*I != PStore && I->mayReadOrWriteMemory()) return false; - // I is used in the same BB. Since BI uses Cond and doesn't have more slots - // to use any other instruction, User must be an instruction between next(I) - // and Cond. - // Account for the cost of duplicating this instruction into each - // predecessor. - NumBonusInsts += PredCount; - // Early exits once we reach the limit. - if (NumBonusInsts > BonusInstThreshold) + // If we're not in aggressive mode, we only optimize if we have some + // confidence that by optimizing we'll allow P and/or Q to be if-converted. + auto IsWorthwhile = [&](BasicBlock *BB, ArrayRef FreeStores) { + if (!BB) + return true; + // Heuristic: if the block can be if-converted/phi-folded and the + // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to + // thread this store. + int BudgetRemaining = + PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; + for (auto &I : BB->instructionsWithoutDebug()) { + // Consider terminator instruction to be free. 
+ if (I.isTerminator()) + continue; + // If this is one of the stores that we want to speculate out of this BB, + // then don't count its cost, consider it to be free. + if (auto *S = dyn_cast(&I)) + if (llvm::find(FreeStores, S)) + continue; + // Else, we have a white-list of instructions that we are ok speculating. + if (!isa(I) && !isa(I)) + return false; // Not in white-list - not worthwhile folding. + // And finally, if this is a non-free instruction that we are okay + // speculating, ensure that we consider the speculation budget. + BudgetRemaining -= TTI.getUserCost(&I); + if (BudgetRemaining < 0) + return false; // Eagerly refuse to fold as soon as we're out of budget. + } + assert(BudgetRemaining >= 0 && + "When we run out of budget we will eagerly return from within the " + "per-instruction loop."); + return true; + }; + + const SmallVector FreeStores = {PStore, QStore}; + if (!MergeCondStoresAggressively && + (!IsWorthwhile(PTB, FreeStores) || !IsWorthwhile(PFB, FreeStores) || + !IsWorthwhile(QTB, FreeStores) || !IsWorthwhile(QFB, FreeStores))) + return false; + + // If PostBB has more than two predecessors, we need to split it so we can + // sink the store. + if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) { + // We know that QFB's only successor is PostBB. And QFB has a single + // predecessor. If QTB exists, then its only successor is also PostBB. + // If QTB does not exist, then QFB's only predecessor has a conditional + // branch to QFB and PostBB. + BasicBlock *TruePred = QTB ? QTB : QFB->getSinglePredecessor(); + BasicBlock *NewBB = + SplitBlockPredecessors(PostBB, {QFB, TruePred}, "condstore.split"); + if (!NewBB) return false; + PostBB = NewBB; } - // Cond is known to be a compare or binary operator. Check to make sure that - // neither operand is a potentially-trapping constant expression. 
- if (ConstantExpr *CE = dyn_cast(Cond->getOperand(0))) - if (CE->canTrap()) - return false; - if (ConstantExpr *CE = dyn_cast(Cond->getOperand(1))) - if (CE->canTrap()) - return false; + // OK, we're going to sink the stores to PostBB. The store has to be + // conditional though, so first create the predicate. + Value *PCond = cast(PFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); + Value *QCond = cast(QFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); - // Finally, don't infinitely unroll conditional loops. - BasicBlock *TrueDest = BI->getSuccessor(0); - BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : nullptr; - if (TrueDest == BB || FalseDest == BB) - return false; + Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(), + PStore->getParent()); + Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(), + QStore->getParent(), PPHI); - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *PredBlock = *PI; - BranchInst *PBI = dyn_cast(PredBlock->getTerminator()); - - // Check that we have two conditional branches. If there is a PHI node in - // the common successor, verify that the same value flows in from both - // blocks. - SmallVector PHIs; - if (!PBI || PBI->isUnconditional() || - (BI->isConditional() && !SafeToMergeTerminators(BI, PBI)) || - (!BI->isConditional() && - !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs))) - continue; - - // Determine if the two branches share a common destination. 
- Instruction::BinaryOps Opc = Instruction::BinaryOpsEnd; - bool InvertPredCond = false; - - if (BI->isConditional()) { - if (PBI->getSuccessor(0) == TrueDest) { - Opc = Instruction::Or; - } else if (PBI->getSuccessor(1) == FalseDest) { - Opc = Instruction::And; - } else if (PBI->getSuccessor(0) == FalseDest) { - Opc = Instruction::And; - InvertPredCond = true; - } else if (PBI->getSuccessor(1) == TrueDest) { - Opc = Instruction::Or; - InvertPredCond = true; - } else { - continue; - } - } else { - if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest) - continue; - } - - LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); - IRBuilder<> Builder(PBI); - - // If we need to invert the condition in the pred block to match, do so now. - if (InvertPredCond) { - Value *NewCond = PBI->getCondition(); - - if (NewCond->hasOneUse() && isa(NewCond)) { - CmpInst *CI = cast(NewCond); - CI->setPredicate(CI->getInversePredicate()); - } else { - NewCond = - Builder.CreateNot(NewCond, PBI->getCondition()->getName() + ".not"); - } - - PBI->setCondition(NewCond); - PBI->swapSuccessors(); - } - - // If we have bonus instructions, clone them into the predecessor block. - // Note that there may be multiple predecessor blocks, so we cannot move - // bonus instructions to a predecessor block. - ValueToValueMapTy VMap; // maps original values to cloned values - // We already make sure Cond is the last instruction before BI. Therefore, - // all instructions before Cond other than DbgInfoIntrinsic are bonus - // instructions. - for (auto BonusInst = BB->begin(); Cond != &*BonusInst; ++BonusInst) { - if (isa(BonusInst)) - continue; - Instruction *NewBonusInst = BonusInst->clone(); - RemapInstruction(NewBonusInst, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - VMap[&*BonusInst] = NewBonusInst; - - // If we moved a load, we cannot any longer claim any knowledge about - // its potential value. 
The previous information might have been valid - // only given the branch precondition. - // For an analogous reason, we must also drop all the metadata whose - // semantics we don't understand. - NewBonusInst->dropUnknownNonDebugMetadata(); - - PredBlock->getInstList().insert(PBI->getIterator(), NewBonusInst); - NewBonusInst->takeName(&*BonusInst); - BonusInst->setName(BonusInst->getName() + ".old"); - } - - // Clone Cond into the predecessor basic block, and or/and the - // two conditions together. - Instruction *CondInPred = Cond->clone(); - RemapInstruction(CondInPred, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - PredBlock->getInstList().insert(PBI->getIterator(), CondInPred); - CondInPred->takeName(Cond); - Cond->setName(CondInPred->getName() + ".old"); - - if (BI->isConditional()) { - Instruction *NewCond = cast( - Builder.CreateBinOp(Opc, PBI->getCondition(), CondInPred, "or.cond")); - PBI->setCondition(NewCond); - - uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight; - bool HasWeights = - extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight, - SuccTrueWeight, SuccFalseWeight); - SmallVector NewWeights; - - if (PBI->getSuccessor(0) == BB) { - if (HasWeights) { - // PBI: br i1 %x, BB, FalseDest - // BI: br i1 %y, TrueDest, FalseDest - // TrueWeight is TrueWeight for PBI * TrueWeight for BI. - NewWeights.push_back(PredTrueWeight * SuccTrueWeight); - // FalseWeight is FalseWeight for PBI * TotalWeight for BI + - // TrueWeight for PBI * FalseWeight for BI. - // We assume that total weights of a BranchInst can fit into 32 bits. - // Therefore, we will not have overflow using 64-bit arithmetic. 
- NewWeights.push_back(PredFalseWeight * - (SuccFalseWeight + SuccTrueWeight) + - PredTrueWeight * SuccFalseWeight); - } - AddPredecessorToBlock(TrueDest, PredBlock, BB, MSSAU); - PBI->setSuccessor(0, TrueDest); - } - if (PBI->getSuccessor(1) == BB) { - if (HasWeights) { - // PBI: br i1 %x, TrueDest, BB - // BI: br i1 %y, TrueDest, FalseDest - // TrueWeight is TrueWeight for PBI * TotalWeight for BI + - // FalseWeight for PBI * TrueWeight for BI. - NewWeights.push_back(PredTrueWeight * - (SuccFalseWeight + SuccTrueWeight) + - PredFalseWeight * SuccTrueWeight); - // FalseWeight is FalseWeight for PBI * FalseWeight for BI. - NewWeights.push_back(PredFalseWeight * SuccFalseWeight); - } - AddPredecessorToBlock(FalseDest, PredBlock, BB, MSSAU); - PBI->setSuccessor(1, FalseDest); - } - if (NewWeights.size() == 2) { - // Halve the weights if any of them cannot fit in an uint32_t - FitWeights(NewWeights); - - SmallVector MDWeights(NewWeights.begin(), - NewWeights.end()); - setBranchWeights(PBI, MDWeights[0], MDWeights[1]); - } else - PBI->setMetadata(LLVMContext::MD_prof, nullptr); - } else { - // Update PHI nodes in the common successors. 
- for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { - ConstantInt *PBI_C = cast( - PHIs[i]->getIncomingValueForBlock(PBI->getParent())); - assert(PBI_C->getType()->isIntegerTy(1)); - Instruction *MergedCond = nullptr; - if (PBI->getSuccessor(0) == TrueDest) { - // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value) - // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value) - // is false: !PBI_Cond and BI_Value - Instruction *NotCond = cast( - Builder.CreateNot(PBI->getCondition(), "not.cond")); - MergedCond = cast( - Builder.CreateBinOp(Instruction::And, NotCond, CondInPred, - "and.cond")); - if (PBI_C->isOne()) - MergedCond = cast(Builder.CreateBinOp( - Instruction::Or, PBI->getCondition(), MergedCond, "or.cond")); - } else { - // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C) - // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond) - // is false: PBI_Cond and BI_Value - MergedCond = cast(Builder.CreateBinOp( - Instruction::And, PBI->getCondition(), CondInPred, "and.cond")); - if (PBI_C->isOne()) { - Instruction *NotCond = cast( - Builder.CreateNot(PBI->getCondition(), "not.cond")); - MergedCond = cast(Builder.CreateBinOp( - Instruction::Or, NotCond, MergedCond, "or.cond")); - } - } - // Update PHI Node. - PHIs[i]->setIncomingValueForBlock(PBI->getParent(), MergedCond); - } - - // PBI is changed to branch to TrueDest below. Remove itself from - // potential phis from all other successors. - if (MSSAU) - MSSAU->changeCondBranchToUnconditionalTo(PBI, TrueDest); - - // Change PBI from Conditional to Unconditional. - BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI); - EraseTerminatorAndDCECond(PBI, MSSAU); - PBI = New_PBI; - } - - // If BI was a loop latch, it may have had associated loop metadata. - // We need to copy it to the new latch, that is, PBI. 
- if (MDNode *LoopMD = BI->getMetadata(LLVMContext::MD_loop)) - PBI->setMetadata(LLVMContext::MD_loop, LoopMD); - - // TODO: If BB is reachable from all paths through PredBlock, then we - // could replace PBI's branch probabilities with BI's. - - // Copy any debug value intrinsics into the end of PredBlock. - for (Instruction &I : *BB) - if (isa(I)) - I.clone()->insertBefore(PBI); - - return true; - } - return false; -} - -// If there is only one store in BB1 and BB2, return it, otherwise return -// nullptr. -static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) { - StoreInst *S = nullptr; - for (auto *BB : {BB1, BB2}) { - if (!BB) - continue; - for (auto &I : *BB) - if (auto *SI = dyn_cast(&I)) { - if (S) - // Multiple stores seen. - return nullptr; - else - S = SI; - } - } - return S; -} - -static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, - Value *AlternativeV = nullptr) { - // PHI is going to be a PHI node that allows the value V that is defined in - // BB to be referenced in BB's only successor. - // - // If AlternativeV is nullptr, the only value we care about in PHI is V. It - // doesn't matter to us what the other operand is (it'll never get used). We - // could just create a new PHI with an undef incoming value, but that could - // increase register pressure if EarlyCSE/InstCombine can't fold it with some - // other PHI. So here we directly look for some PHI in BB's successor with V - // as an incoming operand. If we find one, we use it, else we create a new - // one. - // - // If AlternativeV is not nullptr, we care about both incoming values in PHI. - // PHI must be exactly: phi [ %BB, %V ], [ %OtherBB, %AlternativeV] - // where OtherBB is the single other predecessor of BB's only successor. 
- PHINode *PHI = nullptr; - BasicBlock *Succ = BB->getSingleSuccessor(); - - for (auto I = Succ->begin(); isa(I); ++I) - if (cast(I)->getIncomingValueForBlock(BB) == V) { - PHI = cast(I); - if (!AlternativeV) - break; - - assert(Succ->hasNPredecessors(2)); - auto PredI = pred_begin(Succ); - BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI; - if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV) - break; - PHI = nullptr; - } - if (PHI) - return PHI; - - // If V is not an instruction defined in BB, just return it. - if (!AlternativeV && - (!isa(V) || cast(V)->getParent() != BB)) - return V; - - PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front()); - PHI->addIncoming(V, BB); - for (BasicBlock *PredBB : predecessors(Succ)) - if (PredBB != BB) - PHI->addIncoming( - AlternativeV ? AlternativeV : UndefValue::get(V->getType()), PredBB); - return PHI; -} - -static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, - BasicBlock *QTB, BasicBlock *QFB, - BasicBlock *PostBB, Value *Address, - bool InvertPCond, bool InvertQCond, - const DataLayout &DL, - const TargetTransformInfo &TTI) { - // For every pointer, there must be exactly two stores, one coming from - // PTB or PFB, and the other from QTB or QFB. We don't support more than one - // store (to any address) in PTB,PFB or QTB,QFB. - // FIXME: We could relax this restriction with a bit more work and performance - // testing. - StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB); - StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB); - if (!PStore || !QStore) - return false; - - // Now check the stores are compatible. - if (!QStore->isUnordered() || !PStore->isUnordered()) - return false; - - // Check that sinking the store won't cause program behavior changes. Sinking - // the store out of the Q blocks won't change any behavior as we're sinking - // from a block to its unconditional successor. 
But we're moving a store from - // the P blocks down through the middle block (QBI) and past both QFB and QTB. - // So we need to check that there are no aliasing loads or stores in - // QBI, QTB and QFB. We also need to check there are no conflicting memory - // operations between PStore and the end of its parent block. - // - // The ideal way to do this is to query AliasAnalysis, but we don't - // preserve AA currently so that is dangerous. Be super safe and just - // check there are no other memory operations at all. - for (auto &I : *QFB->getSinglePredecessor()) - if (I.mayReadOrWriteMemory()) - return false; - for (auto &I : *QFB) - if (&I != QStore && I.mayReadOrWriteMemory()) - return false; - if (QTB) - for (auto &I : *QTB) - if (&I != QStore && I.mayReadOrWriteMemory()) - return false; - for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end(); - I != E; ++I) - if (&*I != PStore && I->mayReadOrWriteMemory()) - return false; - - // If we're not in aggressive mode, we only optimize if we have some - // confidence that by optimizing we'll allow P and/or Q to be if-converted. - auto IsWorthwhile = [&](BasicBlock *BB, ArrayRef FreeStores) { - if (!BB) - return true; - // Heuristic: if the block can be if-converted/phi-folded and the - // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to - // thread this store. - int BudgetRemaining = - PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; - for (auto &I : BB->instructionsWithoutDebug()) { - // Consider terminator instruction to be free. - if (I.isTerminator()) - continue; - // If this is one the stores that we want to speculate out of this BB, - // then don't count it's cost, consider it to be free. - if (auto *S = dyn_cast(&I)) - if (llvm::find(FreeStores, S)) - continue; - // Else, we have a white-list of instructions that we are ak speculating. - if (!isa(I) && !isa(I)) - return false; // Not in white-list - not worthwhile folding. 
- // And finally, if this is a non-free instruction that we are okay - // speculating, ensure that we consider the speculation budget. - BudgetRemaining -= TTI.getUserCost(&I); - if (BudgetRemaining < 0) - return false; // Eagerly refuse to fold as soon as we're out of budget. - } - assert(BudgetRemaining >= 0 && - "When we run out of budget we will eagerly return from within the " - "per-instruction loop."); - return true; - }; - - const SmallVector FreeStores = {PStore, QStore}; - if (!MergeCondStoresAggressively && - (!IsWorthwhile(PTB, FreeStores) || !IsWorthwhile(PFB, FreeStores) || - !IsWorthwhile(QTB, FreeStores) || !IsWorthwhile(QFB, FreeStores))) - return false; - - // If PostBB has more than two predecessors, we need to split it so we can - // sink the store. - if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) { - // We know that QFB's only successor is PostBB. And QFB has a single - // predecessor. If QTB exists, then its only successor is also PostBB. - // If QTB does not exist, then QFB's only predecessor has a conditional - // branch to QFB and PostBB. - BasicBlock *TruePred = QTB ? QTB : QFB->getSinglePredecessor(); - BasicBlock *NewBB = SplitBlockPredecessors(PostBB, { QFB, TruePred}, - "condstore.split"); - if (!NewBB) - return false; - PostBB = NewBB; - } - - // OK, we're going to sink the stores to PostBB. The store has to be - // conditional though, so first create the predicate. - Value *PCond = cast(PFB->getSinglePredecessor()->getTerminator()) - ->getCondition(); - Value *QCond = cast(QFB->getSinglePredecessor()->getTerminator()) - ->getCondition(); - - Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(), - PStore->getParent()); - Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(), - QStore->getParent(), PPHI); - - IRBuilder<> QB(&*PostBB->getFirstInsertionPt()); + IRBuilder<> QB(&*PostBB->getFirstInsertionPt()); Value *PPred = PStore->getParent() == PTB ? 
PCond : QB.CreateNot(PCond); Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond); @@ -3523,10 +3225,11 @@ // Takes care of updating the successors and removing the old terminator. // Also makes sure not to introduce new successors by assuming that edges to // non-successor TrueBBs and FalseBBs aren't reachable. -static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, - BasicBlock *TrueBB, BasicBlock *FalseBB, - uint32_t TrueWeight, - uint32_t FalseWeight) { +bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm, + Value *Cond, BasicBlock *TrueBB, + BasicBlock *FalseBB, + uint32_t TrueWeight, + uint32_t FalseWeight) { // Remove any superfluous successor edges from the CFG. // First, figure out which successors to preserve. // If TrueBB and FalseBB are equal, only try to preserve one copy of that @@ -3578,7 +3281,7 @@ Builder.CreateBr(FalseBB); } - EraseTerminatorAndDCECond(OldTerm); + EraseTerminatorAndDCECond(OldTerm, nullptr, Options.AC); return true; } @@ -3586,7 +3289,8 @@ // (switch (select cond, X, Y)) on constant X, Y // with a branch - conditional if X and Y lead to distinct BBs, // unconditional otherwise. -static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) { +bool SimplifyCFGOpt::SimplifySwitchOnSelect(SwitchInst *SI, + SelectInst *Select) { // Check for constant integer values in the select. ConstantInt *TrueVal = dyn_cast(Select->getTrueValue()); ConstantInt *FalseVal = dyn_cast(Select->getFalseValue()); @@ -3622,7 +3326,8 @@ // blockaddress(@fn, BlockB))) // with // (br cond, BlockA, BlockB). -static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { +bool SimplifyCFGOpt::SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, + SelectInst *SI) { // Check that both operands of the select are block addresses. 
BlockAddress *TBA = dyn_cast(SI->getTrueValue()); BlockAddress *FBA = dyn_cast(SI->getFalseValue()); @@ -3757,8 +3462,9 @@ /// The specified branch is a conditional branch. /// Check to see if it is branching on an or/and chain of icmp instructions, and /// fold it into a switch instruction if so. -static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, - const DataLayout &DL) { +bool SimplifyCFGOpt::SimplifyBranchOnICmpChain(BranchInst *BI, + IRBuilder<> &Builder, + const DataLayout &DL) { Instruction *Cond = dyn_cast(BI->getCondition()); if (!Cond) return false; @@ -3866,7 +3572,7 @@ } // Erase the old branch instruction. - EraseTerminatorAndDCECond(BI); + EraseTerminatorAndDCECond(BI, nullptr, Options.AC); LLVM_DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n'); return true; @@ -4307,7 +4013,7 @@ Builder.CreateAssumption(Cond); Builder.CreateBr(BI->getSuccessor(0)); } - EraseTerminatorAndDCECond(BI); + EraseTerminatorAndDCECond(BI, nullptr, Options.AC); Changed = true; } } else if (auto *SI = dyn_cast(TI)) { @@ -4390,7 +4096,7 @@ return true; } -static void createUnreachableSwitchDefault(SwitchInst *Switch) { +void createUnreachableSwitchDefault(SwitchInst *Switch, AssumptionCache *AC) { LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); BasicBlock *NewDefaultBlock = SplitBlockPredecessors(Switch->getDefaultDest(), Switch->getParent(), ""); @@ -4398,12 +4104,13 @@ SplitBlock(&*NewDefaultBlock, &NewDefaultBlock->front()); auto *NewTerminator = NewDefaultBlock->getTerminator(); new UnreachableInst(Switch->getContext(), NewTerminator); - EraseTerminatorAndDCECond(NewTerminator); + EraseTerminatorAndDCECond(NewTerminator, nullptr, AC); } /// Turn a switch with two reachable destinations into an integer range /// comparison and branch. 
-static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { +bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI, + IRBuilder<> &Builder) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); bool HasDefault = @@ -4512,7 +4219,7 @@ // Clean up the default block - it may have phis or other instructions before // the unreachable terminator. if (!HasDefault) - createUnreachableSwitchDefault(SI); + createUnreachableSwitchDefault(SI, Options.AC); // Drop the switch. SI->eraseFromParent(); @@ -4558,7 +4265,7 @@ if (HasDefault && DeadCases.empty() && NumUnknownBits < 64 /* avoid overflow */ && SI->getNumCases() == (1ULL << NumUnknownBits)) { - createUnreachableSwitchDefault(SI); + createUnreachableSwitchDefault(SI, AC); return true; } @@ -5641,561 +5348,869 @@ if (isSwitchDense(Values)) return false; - // First, transform the values such that they start at zero and ascend. - int64_t Base = Values[0]; - for (auto &V : Values) - V -= (uint64_t)(Base); + // First, transform the values such that they start at zero and ascend. + int64_t Base = Values[0]; + for (auto &V : Values) + V -= (uint64_t)(Base); + + // Now we have signed numbers that have been shifted so that, given enough + // precision, there are no negative values. Since the rest of the transform + // is bitwise only, we switch now to an unsigned representation. + + // This transform can be done speculatively because it is so cheap - it + // results in a single rotate operation being inserted. + // FIXME: It's possible that optimizing a switch on powers of two might also + // be beneficial - flag values are often powers of two and we could use a CLZ + // as the key function. + + // countTrailingZeros(0) returns 64. As Values is guaranteed to have more than + // one element and LLVM disallows duplicate cases, Shift is guaranteed to be + // less than 64. 
+ unsigned Shift = 64; + for (auto &V : Values) + Shift = std::min(Shift, countTrailingZeros((uint64_t)V)); + assert(Shift < 64); + if (Shift > 0) + for (auto &V : Values) + V = (int64_t)((uint64_t)V >> Shift); + + if (!isSwitchDense(Values)) + // Transform didn't create a dense switch. + return false; + + // The obvious transform is to shift the switch condition right and emit a + // check that the condition actually cleanly divided by GCD, i.e. + // C & (1 << Shift - 1) == 0 + // inserting a new CFG edge to handle the case where it didn't divide cleanly. + // + // A cheaper way of doing this is a simple ROTR(C, Shift). This performs the + // shift and puts the shifted-off bits in the uppermost bits. If any of these + // are nonzero then the switch condition will be very large and will hit the + // default case. + + auto *Ty = cast(SI->getCondition()->getType()); + Builder.SetInsertPoint(SI); + auto *ShiftC = ConstantInt::get(Ty, Shift); + auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base)); + auto *LShr = Builder.CreateLShr(Sub, ShiftC); + auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift); + auto *Rot = Builder.CreateOr(LShr, Shl); + SI->replaceUsesOfWith(SI->getCondition(), Rot); + + for (auto Case : SI->cases()) { + auto *Orig = Case.getCaseValue(); + auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base); + Case.setValue( + cast(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue())))); + } + return true; +} + +bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { + BasicBlock *BB = SI->getParent(); + + if (isValueEqualityComparison(SI)) { + // If we only have one predecessor, and if it is a branch on this value, + // see if that predecessor totally determines the outcome of this switch. 
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) + if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder)) + return requestResimplify(); + + Value *Cond = SI->getCondition(); + if (SelectInst *Select = dyn_cast(Cond)) + if (SimplifySwitchOnSelect(SI, Select)) + return requestResimplify(); + + // If the block only contains the switch, see if we can fold the block + // away into any preds. + if (SI == &*BB->instructionsWithoutDebug().begin()) + if (FoldValueComparisonIntoPredecessors(SI, Builder)) + return requestResimplify(); + } + + // Try to transform the switch into an icmp and a branch. + if (TurnSwitchRangeIntoICmp(SI, Builder)) + return requestResimplify(); + + // Remove unreachable cases. + if (eliminateDeadSwitchCases(SI, Options.AC, DL)) + return requestResimplify(); + + if (switchToSelect(SI, Builder, DL, TTI)) + return requestResimplify(); + + if (Options.ForwardSwitchCondToPhi && ForwardSwitchConditionToPHI(SI)) + return requestResimplify(); + + // The conversion from switch to lookup tables results in difficult-to-analyze + // code and makes pruning branches much harder. This is a problem if the + // switch expression itself can still be restricted as a result of inlining or + // CVP. Therefore, only apply this transformation during late stages of the + // optimisation pipeline. + if (Options.ConvertSwitchToLookupTable && + SwitchToLookupTable(SI, Builder, DL, TTI)) + return requestResimplify(); + + if (ReduceSwitchRange(SI, Builder, DL, TTI)) + return requestResimplify(); + + return false; +} + +bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) { + BasicBlock *BB = IBI->getParent(); + bool Changed = false; + + // Eliminate redundant destinations. 
+  SmallPtrSet Succs;
+  for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+    BasicBlock *Dest = IBI->getDestination(i);
+    if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
+      Dest->removePredecessor(BB);
+      IBI->removeDestination(i);
+      --i;
+      --e;
+      Changed = true;
+    }
+  }
+
+  if (IBI->getNumDestinations() == 0) {
+    // If the indirectbr has no successors, change it to unreachable.
+    new UnreachableInst(IBI->getContext(), IBI);
+    EraseTerminatorAndDCECond(IBI, nullptr, Options.AC);
+    return true;
+  }
+
+  if (IBI->getNumDestinations() == 1) {
+    // If the indirectbr has one successor, change it to a direct branch.
+    BranchInst::Create(IBI->getDestination(0), IBI);
+    EraseTerminatorAndDCECond(IBI, nullptr, Options.AC);
+    return true;
+  }
+
+  if (SelectInst *SI = dyn_cast(IBI->getAddress())) {
+    if (SimplifyIndirectBrOnSelect(IBI, SI))
+      return requestResimplify();
+  }
+  return Changed;
+}
+
+/// Given a block with only a single landing pad and an unconditional branch
+/// try to find another basic block which this one can be merged with. This
+/// handles cases where we have multiple invokes with unique landing pads, but
+/// a shared handler.
+///
+/// We specifically choose to not worry about merging non-empty blocks
+/// here. That is a PRE/scheduling problem and is best solved elsewhere. In
+/// practice, the optimizer produces empty landing pad blocks quite frequently
+/// when dealing with exception dense code. (see: instcombine, gvn, if-else
+/// sinking in this file)
+///
+/// This is primarily a code size optimization. We need to avoid performing
+/// any transform which might inhibit optimization (such as our ability to
+/// specialize a particular handler via tail commoning). We do this by not
+/// merging any blocks which require us to introduce a phi. Since the same
+/// values are flowing through both blocks, we don't lose any ability to
+/// specialize. If anything, we make such specialization more likely.
+/// +/// TODO - This transformation could remove entries from a phi in the target +/// block when the inputs in the phi are the same for the two blocks being +/// merged. In some cases, this could result in removal of the PHI entirely. +static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, + BasicBlock *BB) { + auto Succ = BB->getUniqueSuccessor(); + assert(Succ); + // If there's a phi in the successor block, we'd likely have to introduce + // a phi into the merged landing pad block. + if (isa(*Succ->begin())) + return false; + + for (BasicBlock *OtherPred : predecessors(Succ)) { + if (BB == OtherPred) + continue; + BasicBlock::iterator I = OtherPred->begin(); + LandingPadInst *LPad2 = dyn_cast(I); + if (!LPad2 || !LPad2->isIdenticalTo(LPad)) + continue; + for (++I; isa(I); ++I) + ; + BranchInst *BI2 = dyn_cast(I); + if (!BI2 || !BI2->isIdenticalTo(BI)) + continue; + + // We've found an identical block. Update our predecessors to take that + // path instead and make ourselves dead. + SmallPtrSet Preds; + Preds.insert(pred_begin(BB), pred_end(BB)); + for (BasicBlock *Pred : Preds) { + InvokeInst *II = cast(Pred->getTerminator()); + assert(II->getNormalDest() != BB && II->getUnwindDest() == BB && + "unexpected successor"); + II->setUnwindDest(OtherPred); + } + + // The debug info in OtherPred doesn't cover the merged control flow that + // used to go through BB. We need to delete it or update it. + for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) { + Instruction &Inst = *I; + I++; + if (isa(Inst)) + Inst.eraseFromParent(); + } + + SmallPtrSet Succs; + Succs.insert(succ_begin(BB), succ_end(BB)); + for (BasicBlock *Succ : Succs) { + Succ->removePredecessor(BB); + } - // Now we have signed numbers that have been shifted so that, given enough - // precision, there are no negative values. Since the rest of the transform - // is bitwise only, we switch now to an unsigned representation. 
+ IRBuilder<> Builder(BI); + Builder.CreateUnreachable(); + BI->eraseFromParent(); + return true; + } + return false; +} - // This transform can be done speculatively because it is so cheap - it - // results in a single rotate operation being inserted. - // FIXME: It's possible that optimizing a switch on powers of two might also - // be beneficial - flag values are often powers of two and we could use a CLZ - // as the key function. +bool SimplifyCFGOpt::simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder) { + return Branch->isUnconditional() ? simplifyUncondBranch(Branch, Builder) + : simplifyCondBranch(Branch, Builder); +} - // countTrailingZeros(0) returns 64. As Values is guaranteed to have more than - // one element and LLVM disallows duplicate cases, Shift is guaranteed to be - // less than 64. - unsigned Shift = 64; - for (auto &V : Values) - Shift = std::min(Shift, countTrailingZeros((uint64_t)V)); - assert(Shift < 64); - if (Shift > 0) - for (auto &V : Values) - V = (int64_t)((uint64_t)V >> Shift); +bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI, + IRBuilder<> &Builder) { + BasicBlock *BB = BI->getParent(); + BasicBlock *Succ = BI->getSuccessor(0); - if (!isSwitchDense(Values)) - // Transform didn't create a dense switch. - return false; + // If the Terminator is the only non-phi instruction, simplify the block. + // If LoopHeader is provided, check if the block or its successor is a loop + // header. (This is for early invocations before loop simplify and + // vectorization to keep canonical loop forms for nested loops. These blocks + // can be eliminated when the pass is invoked later in the back-end.) + // Note that if BB has only one predecessor then we do not introduce new + // backedge, so we can eliminate BB. 
+ bool NeedCanonicalLoop = + Options.NeedCanonicalLoop && + (LoopHeaders && BB->hasNPredecessorsOrMore(2) && + (LoopHeaders->count(BB) || LoopHeaders->count(Succ))); + BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); + if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && + !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB)) + return true; - // The obvious transform is to shift the switch condition right and emit a - // check that the condition actually cleanly divided by GCD, i.e. - // C & (1 << Shift - 1) == 0 - // inserting a new CFG edge to handle the case where it didn't divide cleanly. - // - // A cheaper way of doing this is a simple ROTR(C, Shift). This performs the - // shift and puts the shifted-off bits in the uppermost bits. If any of these - // are nonzero then the switch condition will be very large and will hit the - // default case. + // If the only instruction in the block is a seteq/setne comparison against a + // constant, try to simplify the block. + if (ICmpInst *ICI = dyn_cast(I)) + if (ICI->isEquality() && isa(ICI->getOperand(1))) { + for (++I; isa(I); ++I) + ; + if (I->isTerminator() && + tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder)) + return true; + } - auto *Ty = cast(SI->getCondition()->getType()); - Builder.SetInsertPoint(SI); - auto *ShiftC = ConstantInt::get(Ty, Shift); - auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base)); - auto *LShr = Builder.CreateLShr(Sub, ShiftC); - auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift); - auto *Rot = Builder.CreateOr(LShr, Shl); - SI->replaceUsesOfWith(SI->getCondition(), Rot); + // See if we can merge an empty landing pad block with another which is + // equivalent. 
+ if (LandingPadInst *LPad = dyn_cast(I)) { + for (++I; isa(I); ++I) + ; + if (I->isTerminator() && TryToMergeLandingPad(LPad, BI, BB)) + return true; + } - for (auto Case : SI->cases()) { - auto *Orig = Case.getCaseValue(); - auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base); - Case.setValue( - cast(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue())))); + // If this basic block is ONLY a compare and a branch, and if a predecessor + // branches to us and our successor, fold the comparison into the + // predecessor and use logical operations to update the incoming value + // for PHI nodes in common successor. + if (FoldBranchToCommonDest(BI, nullptr, Options.BonusInstThreshold)) + return requestResimplify(); + return false; +} + +static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) { + BasicBlock *PredPred = nullptr; + for (auto *P : predecessors(BB)) { + BasicBlock *PPred = P->getSinglePredecessor(); + if (!PPred || (PredPred && PredPred != PPred)) + return nullptr; + PredPred = PPred; } - return true; + return PredPred; } -bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { - BasicBlock *BB = SI->getParent(); +bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { + BasicBlock *BB = BI->getParent(); + const Function *Fn = BB->getParent(); + if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing)) + return false; - if (isValueEqualityComparison(SI)) { + // Conditional branch + if (isValueEqualityComparison(BI)) { // If we only have one predecessor, and if it is a branch on this value, - // see if that predecessor totally determines the outcome of this switch. + // see if that predecessor totally determines the outcome of this + // switch. 
if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) - if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder)) + if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder)) return requestResimplify(); - Value *Cond = SI->getCondition(); - if (SelectInst *Select = dyn_cast(Cond)) - if (SimplifySwitchOnSelect(SI, Select)) + // This block must be empty, except for the setcond inst, if it exists. + // Ignore dbg intrinsics. + auto I = BB->instructionsWithoutDebug().begin(); + if (&*I == BI) { + if (FoldValueComparisonIntoPredecessors(BI, Builder)) return requestResimplify(); - - // If the block only contains the switch, see if we can fold the block - // away into any preds. - if (SI == &*BB->instructionsWithoutDebug().begin()) - if (FoldValueComparisonIntoPredecessors(SI, Builder)) + } else if (&*I == cast(BI->getCondition())) { + ++I; + if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder)) return requestResimplify(); + } } - // Try to transform the switch into an icmp and a branch. - if (TurnSwitchRangeIntoICmp(SI, Builder)) - return requestResimplify(); + // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction. + if (SimplifyBranchOnICmpChain(BI, Builder, DL)) + return true; - // Remove unreachable cases. - if (eliminateDeadSwitchCases(SI, Options.AC, DL)) + // If this basic block has dominating predecessor blocks and the dominating + // blocks' conditions imply BI's condition, we know the direction of BI. + Optional Imp = isImpliedByDomCondition(BI->getCondition(), BI, DL); + if (Imp) { + // Turn this into a branch on constant. + auto *OldCond = BI->getCondition(); + ConstantInt *TorF = *Imp ? 
ConstantInt::getTrue(BB->getContext()) + : ConstantInt::getFalse(BB->getContext()); + BI->setCondition(TorF); + RecursivelyDeleteTriviallyDeadInstructions(OldCond, nullptr, nullptr, + Options.AC); return requestResimplify(); + } - if (switchToSelect(SI, Builder, DL, TTI)) + // If this basic block is ONLY a compare and a branch, and if a predecessor + // branches to us and one of our successors, fold the comparison into the + // predecessor and use logical operations to pick the right destination. + if (FoldBranchToCommonDest(BI, nullptr, Options.BonusInstThreshold)) return requestResimplify(); - if (Options.ForwardSwitchCondToPhi && ForwardSwitchConditionToPHI(SI)) - return requestResimplify(); + // We have a conditional branch to two blocks that are only reachable + // from BI. We know that the condbr dominates the two blocks, so see if + // there is any identical code in the "then" and "else" blocks. If so, we + // can hoist it up to the branching block. + if (BI->getSuccessor(0)->getSinglePredecessor()) { + if (BI->getSuccessor(1)->getSinglePredecessor()) { + if (HoistThenElseCodeToIf(BI, TTI)) + return requestResimplify(); + } else { + // If Successor #1 has multiple preds, we may be able to conditionally + // execute Successor #0 if it branches to Successor #1. + Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator(); + if (Succ0TI->getNumSuccessors() == 1 && + Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI)) + return requestResimplify(); + } + } else if (BI->getSuccessor(1)->getSinglePredecessor()) { + // If Successor #0 has multiple preds, we may be able to conditionally + // execute Successor #1 if it branches to Successor #0. 
+ Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator(); + if (Succ1TI->getNumSuccessors() == 1 && + Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI)) + return requestResimplify(); + } + + // If this is a branch on a phi node in the current block, thread control + // through this block if any PHI node entries are constants. + if (PHINode *PN = dyn_cast(BI->getCondition())) + if (PN->getParent() == BI->getParent()) + if (FoldCondBranchOnPHI(BI, DL, Options.AC)) + return requestResimplify(); + + // Scan predecessor blocks for conditional branches. + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + if (BranchInst *PBI = dyn_cast((*PI)->getTerminator())) + if (PBI != BI && PBI->isConditional()) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL, TTI)) + return requestResimplify(); + + // Look for diamond patterns. + if (MergeCondStores) + if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) + if (BranchInst *PBI = dyn_cast(PrevBB->getTerminator())) + if (PBI != BI && PBI->isConditional()) + if (mergeConditionalStores(PBI, BI, DL, TTI)) + return requestResimplify(); + + return false; +} + +/// Check if passing a value to an instruction will cause undefined behavior. +static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { + Constant *C = dyn_cast(V); + if (!C) + return false; + + if (I->use_empty()) + return false; + + if (C->isNullValue() || isa(C)) { + // Only look at the first use, avoid hurting compile time with long uselists + User *Use = *I->user_begin(); + + // Now make sure that there are no instructions in between that can alter + // control flow (eg. calls) + for (BasicBlock::iterator + i = ++BasicBlock::iterator(I), + UI = BasicBlock::iterator(dyn_cast(Use)); + i != UI; ++i) + if (i == I->getParent()->end() || i->mayHaveSideEffects()) + return false; + + // Look through GEPs. 
A load from a GEP derived from NULL is still undefined + if (GetElementPtrInst *GEP = dyn_cast(Use)) + if (GEP->getPointerOperand() == I) + return passingValueIsAlwaysUndefined(V, GEP); + + // Look through bitcasts. + if (BitCastInst *BC = dyn_cast(Use)) + return passingValueIsAlwaysUndefined(V, BC); + + // Load from null is undefined. + if (LoadInst *LI = dyn_cast(Use)) + if (!LI->isVolatile()) + return !NullPointerIsDefined(LI->getFunction(), + LI->getPointerAddressSpace()); - // The conversion from switch to lookup tables results in difficult-to-analyze - // code and makes pruning branches much harder. This is a problem if the - // switch expression itself can still be restricted as a result of inlining or - // CVP. Therefore, only apply this transformation during late stages of the - // optimisation pipeline. - if (Options.ConvertSwitchToLookupTable && - SwitchToLookupTable(SI, Builder, DL, TTI)) - return requestResimplify(); + // Store to null is undefined. + if (StoreInst *SI = dyn_cast(Use)) + if (!SI->isVolatile()) + return (!NullPointerIsDefined(SI->getFunction(), + SI->getPointerAddressSpace())) && + SI->getPointerOperand() == I; - if (ReduceSwitchRange(SI, Builder, DL, TTI)) - return requestResimplify(); + // A call to null is undefined. + if (auto CS = CallSite(Use)) + return !NullPointerIsDefined(CS->getFunction()) && + CS.getCalledValue() == I; + } + return false; +} + +/// If BB has an incoming value that will always trigger undefined behavior +/// (eg. null pointer dereference), remove the branch leading here. 
+static bool removeUndefIntroducingPredecessor(BasicBlock *BB) {
+  for (PHINode &PHI : BB->phis())
+    for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i)
+      if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) {
+        Instruction *T = PHI.getIncomingBlock(i)->getTerminator();
+        IRBuilder<> Builder(T);
+        if (BranchInst *BI = dyn_cast(T)) {
+          BB->removePredecessor(PHI.getIncomingBlock(i));
+          // Turn unconditional branches into unreachables and remove the dead
+          // destination from conditional branches.
+          if (BI->isUnconditional())
+            Builder.CreateUnreachable();
+          else
+            Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
+                                                       : BI->getSuccessor(0));
+          BI->eraseFromParent();
+          return true;
+        }
+        // TODO: SwitchInst.
+      }
 
   return false;
 }
 
-bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
-  BasicBlock *BB = IBI->getParent();
+bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
   bool Changed = false;
 
-  // Eliminate redundant destinations.
-  SmallPtrSet Succs;
-  for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
-    BasicBlock *Dest = IBI->getDestination(i);
-    if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
-      Dest->removePredecessor(BB);
-      IBI->removeDestination(i);
-      --i;
-      --e;
-      Changed = true;
-    }
-  }
-
-  if (IBI->getNumDestinations() == 0) {
-    // If the indirectbr has no successors, change it to unreachable.
-    new UnreachableInst(IBI->getContext(), IBI);
-    EraseTerminatorAndDCECond(IBI);
-    return true;
-  }
+  assert(BB && BB->getParent() && "Block not embedded in function!");
+  assert(BB->getTerminator() && "Degenerate basic block encountered!");
 
-  if (IBI->getNumDestinations() == 1) {
-    // If the indirectbr has one successor, change it to a direct branch.
-    BranchInst::Create(IBI->getDestination(0), IBI);
+  // Remove basic blocks that have no predecessors (except the entry block)...
+  // or that just have themselves as a predecessor. These are unreachable.
+ if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) || + BB->getSinglePredecessor() == BB) { + LLVM_DEBUG(dbgs() << "Removing BB: \n" << *BB); + DeleteDeadBlock(BB); return true; } - if (SelectInst *SI = dyn_cast(IBI->getAddress())) { - if (SimplifyIndirectBrOnSelect(IBI, SI)) - return requestResimplify(); - } - return Changed; -} - -/// Given an block with only a single landing pad and a unconditional branch -/// try to find another basic block which this one can be merged with. This -/// handles cases where we have multiple invokes with unique landing pads, but -/// a shared handler. -/// -/// We specifically choose to not worry about merging non-empty blocks -/// here. That is a PRE/scheduling problem and is best solved elsewhere. In -/// practice, the optimizer produces empty landing pad blocks quite frequently -/// when dealing with exception dense code. (see: instcombine, gvn, if-else -/// sinking in this file) -/// -/// This is primarily a code size optimization. We need to avoid performing -/// any transform which might inhibit optimization (such as our ability to -/// specialize a particular handler via tail commoning). We do this by not -/// merging any blocks which require us to introduce a phi. Since the same -/// values are flowing through both blocks, we don't lose any ability to -/// specialize. If anything, we make such specialization more likely. -/// -/// TODO - This transformation could remove entries from a phi in the target -/// block when the inputs in the phi are the same for the two blocks being -/// merged. In some cases, this could result in removal of the PHI entirely. -static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, - BasicBlock *BB) { - auto Succ = BB->getUniqueSuccessor(); - assert(Succ); - // If there's a phi in the successor block, we'd likely have to introduce - // a phi into the merged landing pad block. 
- if (isa(*Succ->begin())) - return false; - - for (BasicBlock *OtherPred : predecessors(Succ)) { - if (BB == OtherPred) - continue; - BasicBlock::iterator I = OtherPred->begin(); - LandingPadInst *LPad2 = dyn_cast(I); - if (!LPad2 || !LPad2->isIdenticalTo(LPad)) - continue; - for (++I; isa(I); ++I) - ; - BranchInst *BI2 = dyn_cast(I); - if (!BI2 || !BI2->isIdenticalTo(BI)) - continue; - - // We've found an identical block. Update our predecessors to take that - // path instead and make ourselves dead. - SmallPtrSet Preds; - Preds.insert(pred_begin(BB), pred_end(BB)); - for (BasicBlock *Pred : Preds) { - InvokeInst *II = cast(Pred->getTerminator()); - assert(II->getNormalDest() != BB && II->getUnwindDest() == BB && - "unexpected successor"); - II->setUnwindDest(OtherPred); - } + // Check to see if we can constant propagate this terminator instruction + // away... + Changed |= ConstantFoldTerminator(BB, true); - // The debug info in OtherPred doesn't cover the merged control flow that - // used to go through BB. We need to delete it or update it. - for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) { - Instruction &Inst = *I; - I++; - if (isa(Inst)) - Inst.eraseFromParent(); - } + // Check for and eliminate duplicate PHI nodes in this block. + Changed |= EliminateDuplicatePHINodes(BB); - SmallPtrSet Succs; - Succs.insert(succ_begin(BB), succ_end(BB)); - for (BasicBlock *Succ : Succs) { - Succ->removePredecessor(BB); - } + // Check for and remove branches that will always cause undefined behavior. + Changed |= removeUndefIntroducingPredecessor(BB); - IRBuilder<> Builder(BI); - Builder.CreateUnreachable(); - BI->eraseFromParent(); + // Merge basic blocks into their predecessor if there is only one distinct + // pred, and if there is only one distinct successor of the predecessor, and + // if there are no PHI nodes. 
+ if (MergeBlockIntoPredecessor(BB)) return true; - } - return false; -} - -bool SimplifyCFGOpt::simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder) { - return Branch->isUnconditional() ? simplifyUncondBranch(Branch, Builder) - : simplifyCondBranch(Branch, Builder); -} -bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI, - IRBuilder<> &Builder) { - BasicBlock *BB = BI->getParent(); - BasicBlock *Succ = BI->getSuccessor(0); + if (SinkCommon && Options.SinkCommonInsts) + Changed |= SinkCommonCodeFromPredecessors(BB); - // If the Terminator is the only non-phi instruction, simplify the block. - // If LoopHeader is provided, check if the block or its successor is a loop - // header. (This is for early invocations before loop simplify and - // vectorization to keep canonical loop forms for nested loops. These blocks - // can be eliminated when the pass is invoked later in the back-end.) - // Note that if BB has only one predecessor then we do not introduce new - // backedge, so we can eliminate BB. - bool NeedCanonicalLoop = - Options.NeedCanonicalLoop && - (LoopHeaders && BB->hasNPredecessorsOrMore(2) && - (LoopHeaders->count(BB) || LoopHeaders->count(Succ))); - BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); - if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && - !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB)) - return true; + IRBuilder<> Builder(BB); - // If the only instruction in the block is a seteq/setne comparison against a - // constant, try to simplify the block. - if (ICmpInst *ICI = dyn_cast(I)) - if (ICI->isEquality() && isa(ICI->getOperand(1))) { - for (++I; isa(I); ++I) - ; - if (I->isTerminator() && - tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder)) - return true; - } + // If there is a trivial two-entry PHI node in this basic block, and we can + // eliminate it, do so now. 
+ if (auto *PN = dyn_cast(BB->begin())) + if (PN->getNumIncomingValues() == 2) + Changed |= FoldTwoEntryPHINode(PN, TTI, DL); - // See if we can merge an empty landing pad block with another which is - // equivalent. - if (LandingPadInst *LPad = dyn_cast(I)) { - for (++I; isa(I); ++I) - ; - if (I->isTerminator() && TryToMergeLandingPad(LPad, BI, BB)) - return true; + Instruction *Terminator = BB->getTerminator(); + Builder.SetInsertPoint(Terminator); + switch (Terminator->getOpcode()) { + case Instruction::Br: + Changed |= simplifyBranch(cast(Terminator), Builder); + break; + case Instruction::Ret: + Changed |= simplifyReturn(cast(Terminator), Builder); + break; + case Instruction::Resume: + Changed |= simplifyResume(cast(Terminator), Builder); + break; + case Instruction::CleanupRet: + Changed |= simplifyCleanupReturn(cast(Terminator)); + break; + case Instruction::Switch: + Changed |= simplifySwitch(cast(Terminator), Builder); + break; + case Instruction::Unreachable: + Changed |= simplifyUnreachable(cast(Terminator)); + break; + case Instruction::IndirectBr: + Changed |= simplifyIndirectBr(cast(Terminator)); + break; } - // If this basic block is ONLY a compare and a branch, and if a predecessor - // branches to us and our successor, fold the comparison into the - // predecessor and use logical operations to update the incoming value - // for PHI nodes in common successor. - if (FoldBranchToCommonDest(BI, nullptr, Options.BonusInstThreshold)) - return requestResimplify(); - return false; + return Changed; } -static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) { - BasicBlock *PredPred = nullptr; - for (auto *P : predecessors(BB)) { - BasicBlock *PPred = P->getSinglePredecessor(); - if (!PPred || (PredPred && PredPred != PPred)) - return nullptr; - PredPred = PPred; - } - return PredPred; +bool SimplifyCFGOpt::run(BasicBlock *BB) { + bool Changed = false; + + // Repeated simplify BB as long as resimplification is requested. 
+ do { + Resimplify = false; + + // Perform one round of simplification. Resimplify flag will be set if + // another iteration is requested. + Changed |= simplifyOnce(BB); + } while (Resimplify); + + return Changed; } -bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { +/// If this basic block is simple enough, and if a predecessor branches to us +/// and one of our successors, fold the block into the predecessor and use +/// logical operations to pick the right destination. +bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, +                                  unsigned BonusInstThreshold, +                                  AssumptionCache *AC) { BasicBlock *BB = BI->getParent(); - const Function *Fn = BB->getParent(); - if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing)) - return false; - - // Conditional branch - if (isValueEqualityComparison(BI)) { - // If we only have one predecessor, and if it is a branch on this value, - // see if that predecessor totally determines the outcome of this - // switch. - if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) - if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder)) - return requestResimplify(); - // This block must be empty, except for the setcond inst, if it exists. - // Ignore dbg intrinsics. - auto I = BB->instructionsWithoutDebug().begin(); - if (&*I == BI) { - if (FoldValueComparisonIntoPredecessors(BI, Builder)) - return requestResimplify(); - } else if (&*I == cast(BI->getCondition())) { - ++I; - if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder)) - return requestResimplify(); - } - } + const unsigned PredCount = pred_size(BB); - // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction. 
- if (SimplifyBranchOnICmpChain(BI, Builder, DL)) - return true; + Instruction *Cond = nullptr; + if (BI->isConditional()) + Cond = dyn_cast(BI->getCondition()); + else { + // For unconditional branch, check for a simple CFG pattern, where + // BB has a single predecessor and BB's successor is also its predecessor's + // successor. If such pattern exists, check for CSE between BB and its + // predecessor. + if (BasicBlock *PB = BB->getSinglePredecessor()) + if (BranchInst *PBI = dyn_cast(PB->getTerminator())) + if (PBI->isConditional() && + (BI->getSuccessor(0) == PBI->getSuccessor(0) || + BI->getSuccessor(0) == PBI->getSuccessor(1))) { + for (auto I = BB->instructionsWithoutDebug().begin(), + E = BB->instructionsWithoutDebug().end(); + I != E;) { + Instruction *Curr = &*I++; + if (isa(Curr)) { + Cond = Curr; + break; + } + // Quit if we can't remove this instruction. + if (!tryCSEWithPredecessor(Curr, PB)) + return false; + } + } - // If this basic block has dominating predecessor blocks and the dominating - // blocks' conditions imply BI's condition, we know the direction of BI. - Optional Imp = isImpliedByDomCondition(BI->getCondition(), BI, DL); - if (Imp) { - // Turn this into a branch on constant. - auto *OldCond = BI->getCondition(); - ConstantInt *TorF = *Imp ? ConstantInt::getTrue(BB->getContext()) - : ConstantInt::getFalse(BB->getContext()); - BI->setCondition(TorF); - RecursivelyDeleteTriviallyDeadInstructions(OldCond); - return requestResimplify(); + if (!Cond) + return false; } - // If this basic block is ONLY a compare and a branch, and if a predecessor - // branches to us and one of our successors, fold the comparison into the - // predecessor and use logical operations to pick the right destination. 
- if (FoldBranchToCommonDest(BI, nullptr, Options.BonusInstThreshold)) - return requestResimplify(); + if (!Cond || (!isa(Cond) && !isa(Cond)) || + Cond->getParent() != BB || !Cond->hasOneUse()) + return false; - // We have a conditional branch to two blocks that are only reachable - // from BI. We know that the condbr dominates the two blocks, so see if - // there is any identical code in the "then" and "else" blocks. If so, we - // can hoist it up to the branching block. - if (BI->getSuccessor(0)->getSinglePredecessor()) { - if (BI->getSuccessor(1)->getSinglePredecessor()) { - if (HoistThenElseCodeToIf(BI, TTI)) - return requestResimplify(); - } else { - // If Successor #1 has multiple preds, we may be able to conditionally - // execute Successor #0 if it branches to Successor #1. - Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator(); - if (Succ0TI->getNumSuccessors() == 1 && - Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) - if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI)) - return requestResimplify(); - } - } else if (BI->getSuccessor(1)->getSinglePredecessor()) { - // If Successor #0 has multiple preds, we may be able to conditionally - // execute Successor #1 if it branches to Successor #0. - Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator(); - if (Succ1TI->getNumSuccessors() == 1 && - Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) - if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI)) - return requestResimplify(); - } + // Make sure the instruction after the condition is the cond branch. + BasicBlock::iterator CondIt = ++Cond->getIterator(); - // If this is a branch on a phi node in the current block, thread control - // through this block if any PHI node entries are constants. - if (PHINode *PN = dyn_cast(BI->getCondition())) - if (PN->getParent() == BI->getParent()) - if (FoldCondBranchOnPHI(BI, DL, Options.AC)) - return requestResimplify(); + // Ignore dbg intrinsics. 
+ while (isa(CondIt)) + ++CondIt; - // Scan predecessor blocks for conditional branches. - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - if (BranchInst *PBI = dyn_cast((*PI)->getTerminator())) - if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI, DL, TTI)) - return requestResimplify(); + if (&*CondIt != BI) + return false; - // Look for diamond patterns. - if (MergeCondStores) - if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) - if (BranchInst *PBI = dyn_cast(PrevBB->getTerminator())) - if (PBI != BI && PBI->isConditional()) - if (mergeConditionalStores(PBI, BI, DL, TTI)) - return requestResimplify(); + // Only allow this transformation if computing the condition doesn't involve + // too many instructions and these involved instructions can be executed + // unconditionally. We denote all involved instructions except the condition + // as "bonus instructions", and only allow this transformation when the + // number of the bonus instructions we'll need to create when cloning into + // each predecessor does not exceed a certain threshold. + unsigned NumBonusInsts = 0; + for (auto I = BB->begin(); Cond != &*I; ++I) { + // Ignore dbg intrinsics. + if (isa(I)) + continue; + if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I)) + return false; + // I has only one use and can be executed unconditionally. + Instruction *User = dyn_cast(I->user_back()); + if (User == nullptr || User->getParent() != BB) + return false; + // I is used in the same BB. Since BI uses Cond and doesn't have more slots + // to use any other instruction, User must be an instruction between next(I) + // and Cond. - return false; -} + // Account for the cost of duplicating this instruction into each + // predecessor. + NumBonusInsts += PredCount; + // Early exits once we reach the limit. 
+ if (NumBonusInsts > BonusInstThreshold) + return false; + } -/// Check if passing a value to an instruction will cause undefined behavior. -static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { - Constant *C = dyn_cast(V); - if (!C) - return false; + // Cond is known to be a compare or binary operator. Check to make sure that + // neither operand is a potentially-trapping constant expression. + if (ConstantExpr *CE = dyn_cast(Cond->getOperand(0))) + if (CE->canTrap()) + return false; + if (ConstantExpr *CE = dyn_cast(Cond->getOperand(1))) + if (CE->canTrap()) + return false; - if (I->use_empty()) + // Finally, don't infinitely unroll conditional loops. + BasicBlock *TrueDest = BI->getSuccessor(0); + BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : nullptr; + if (TrueDest == BB || FalseDest == BB) return false; - if (C->isNullValue() || isa(C)) { - // Only look at the first use, avoid hurting compile time with long uselists - User *Use = *I->user_begin(); - - // Now make sure that there are no instructions in between that can alter - // control flow (eg. calls) - for (BasicBlock::iterator - i = ++BasicBlock::iterator(I), - UI = BasicBlock::iterator(dyn_cast(Use)); - i != UI; ++i) - if (i == I->getParent()->end() || i->mayHaveSideEffects()) - return false; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *PredBlock = *PI; + BranchInst *PBI = dyn_cast(PredBlock->getTerminator()); - // Look through GEPs. A load from a GEP derived from NULL is still undefined - if (GetElementPtrInst *GEP = dyn_cast(Use)) - if (GEP->getPointerOperand() == I) - return passingValueIsAlwaysUndefined(V, GEP); + // Check that we have two conditional branches. If there is a PHI node in + // the common successor, verify that the same value flows in from both + // blocks. 
+ SmallVector PHIs; + if (!PBI || PBI->isUnconditional() || + (BI->isConditional() && !SafeToMergeTerminators(BI, PBI)) || + (!BI->isConditional() && + !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs))) + continue; - // Look through bitcasts. - if (BitCastInst *BC = dyn_cast(Use)) - return passingValueIsAlwaysUndefined(V, BC); + // Determine if the two branches share a common destination. + Instruction::BinaryOps Opc = Instruction::BinaryOpsEnd; + bool InvertPredCond = false; - // Load from null is undefined. - if (LoadInst *LI = dyn_cast(Use)) - if (!LI->isVolatile()) - return !NullPointerIsDefined(LI->getFunction(), - LI->getPointerAddressSpace()); + if (BI->isConditional()) { + if (PBI->getSuccessor(0) == TrueDest) { + Opc = Instruction::Or; + } else if (PBI->getSuccessor(1) == FalseDest) { + Opc = Instruction::And; + } else if (PBI->getSuccessor(0) == FalseDest) { + Opc = Instruction::And; + InvertPredCond = true; + } else if (PBI->getSuccessor(1) == TrueDest) { + Opc = Instruction::Or; + InvertPredCond = true; + } else { + continue; + } + } else { + if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest) + continue; + } - // Store to null is undefined. - if (StoreInst *SI = dyn_cast(Use)) - if (!SI->isVolatile()) - return (!NullPointerIsDefined(SI->getFunction(), - SI->getPointerAddressSpace())) && - SI->getPointerOperand() == I; + LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); + IRBuilder<> Builder(PBI); - // A call to null is undefined. - if (auto CS = CallSite(Use)) - return !NullPointerIsDefined(CS->getFunction()) && - CS.getCalledValue() == I; - } - return false; -} + // If we need to invert the condition in the pred block to match, do so now. + if (InvertPredCond) { + Value *NewCond = PBI->getCondition(); -/// If BB has an incoming value that will always trigger undefined behavior -/// (eg. null pointer dereference), remove the branch leading here. 
-static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { - for (PHINode &PHI : BB->phis()) - for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) - if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) { - Instruction *T = PHI.getIncomingBlock(i)->getTerminator(); - IRBuilder<> Builder(T); - if (BranchInst *BI = dyn_cast(T)) { - BB->removePredecessor(PHI.getIncomingBlock(i)); - // Turn uncoditional branches into unreachables and remove the dead - // destination from conditional branches. - if (BI->isUnconditional()) - Builder.CreateUnreachable(); - else - Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1) - : BI->getSuccessor(0)); - BI->eraseFromParent(); - return true; - } - // TODO: SwitchInst. + if (NewCond->hasOneUse() && isa(NewCond)) { + CmpInst *CI = cast(NewCond); + CI->setPredicate(CI->getInversePredicate()); + } else { + NewCond = + Builder.CreateNot(NewCond, PBI->getCondition()->getName() + ".not"); } - return false; -} - -bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { - bool Changed = false; - - assert(BB && BB->getParent() && "Block not embedded in function!"); - assert(BB->getTerminator() && "Degenerate basic block encountered!"); + PBI->setCondition(NewCond); + PBI->swapSuccessors(); + } - // Remove basic blocks that have no predecessors (except the entry block)... - // or that just have themself as a predecessor. These are unreachable. - if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) || - BB->getSinglePredecessor() == BB) { - LLVM_DEBUG(dbgs() << "Removing BB: \n" << *BB); - DeleteDeadBlock(BB); - return true; - } + // If we have bonus instructions, clone them into the predecessor block. + // Note that there may be multiple predecessor blocks, so we cannot move + // bonus instructions to a predecessor block. + ValueToValueMapTy VMap; // maps original values to cloned values + // We already make sure Cond is the last instruction before BI. 
Therefore, + // all instructions before Cond other than DbgInfoIntrinsic are bonus + // instructions. + for (auto BonusInst = BB->begin(); Cond != &*BonusInst; ++BonusInst) { + if (isa(BonusInst)) + continue; + Instruction *NewBonusInst = BonusInst->clone(); + RemapInstruction(NewBonusInst, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + VMap[&*BonusInst] = NewBonusInst; - // Check to see if we can constant propagate this terminator instruction - // away... - Changed |= ConstantFoldTerminator(BB, true); + // If we moved a load, we cannot any longer claim any knowledge about + // its potential value. The previous information might have been valid + // only given the branch precondition. + // For an analogous reason, we must also drop all the metadata whose + // semantics we don't understand. + NewBonusInst->dropUnknownNonDebugMetadata(); - // Check for and eliminate duplicate PHI nodes in this block. - Changed |= EliminateDuplicatePHINodes(BB); + PredBlock->getInstList().insert(PBI->getIterator(), NewBonusInst); + NewBonusInst->takeName(&*BonusInst); + BonusInst->setName(BonusInst->getName() + ".old"); + } - // Check for and remove branches that will always cause undefined behavior. - Changed |= removeUndefIntroducingPredecessor(BB); + // Clone Cond into the predecessor basic block, and or/and the + // two conditions together. + Instruction *CondInPred = Cond->clone(); + RemapInstruction(CondInPred, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + PredBlock->getInstList().insert(PBI->getIterator(), CondInPred); + CondInPred->takeName(Cond); + Cond->setName(CondInPred->getName() + ".old"); - // Merge basic blocks into their predecessor if there is only one distinct - // pred, and if there is only one distinct successor of the predecessor, and - // if there are no PHI nodes. 
- if (MergeBlockIntoPredecessor(BB)) - return true; + if (BI->isConditional()) { + Instruction *NewCond = cast( + Builder.CreateBinOp(Opc, PBI->getCondition(), CondInPred, "or.cond")); + PBI->setCondition(NewCond); - if (SinkCommon && Options.SinkCommonInsts) - Changed |= SinkCommonCodeFromPredecessors(BB); + uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight; + bool HasWeights = + extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight, + SuccTrueWeight, SuccFalseWeight); + SmallVector NewWeights; - IRBuilder<> Builder(BB); + if (PBI->getSuccessor(0) == BB) { + if (HasWeights) { + // PBI: br i1 %x, BB, FalseDest + // BI: br i1 %y, TrueDest, FalseDest + // TrueWeight is TrueWeight for PBI * TrueWeight for BI. + NewWeights.push_back(PredTrueWeight * SuccTrueWeight); + // FalseWeight is FalseWeight for PBI * TotalWeight for BI + + // TrueWeight for PBI * FalseWeight for BI. + // We assume that total weights of a BranchInst can fit into 32 bits. + // Therefore, we will not have overflow using 64-bit arithmetic. + NewWeights.push_back(PredFalseWeight * + (SuccFalseWeight + SuccTrueWeight) + + PredTrueWeight * SuccFalseWeight); + } + AddPredecessorToBlock(TrueDest, PredBlock, BB, MSSAU); + PBI->setSuccessor(0, TrueDest); + } + if (PBI->getSuccessor(1) == BB) { + if (HasWeights) { + // PBI: br i1 %x, TrueDest, BB + // BI: br i1 %y, TrueDest, FalseDest + // TrueWeight is TrueWeight for PBI * TotalWeight for BI + + // FalseWeight for PBI * TrueWeight for BI. + NewWeights.push_back(PredTrueWeight * + (SuccFalseWeight + SuccTrueWeight) + + PredFalseWeight * SuccTrueWeight); + // FalseWeight is FalseWeight for PBI * FalseWeight for BI. 
+ NewWeights.push_back(PredFalseWeight * SuccFalseWeight); + } + AddPredecessorToBlock(FalseDest, PredBlock, BB, MSSAU); + PBI->setSuccessor(1, FalseDest); + } + if (NewWeights.size() == 2) { + // Halve the weights if any of them cannot fit in an uint32_t + FitWeights(NewWeights); - // If there is a trivial two-entry PHI node in this basic block, and we can - // eliminate it, do so now. - if (auto *PN = dyn_cast(BB->begin())) - if (PN->getNumIncomingValues() == 2) - Changed |= FoldTwoEntryPHINode(PN, TTI, DL); + SmallVector MDWeights(NewWeights.begin(), + NewWeights.end()); + setBranchWeights(PBI, MDWeights[0], MDWeights[1]); + } else + PBI->setMetadata(LLVMContext::MD_prof, nullptr); + } else { + // Update PHI nodes in the common successors. + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { + ConstantInt *PBI_C = cast( + PHIs[i]->getIncomingValueForBlock(PBI->getParent())); + assert(PBI_C->getType()->isIntegerTy(1)); + Instruction *MergedCond = nullptr; + if (PBI->getSuccessor(0) == TrueDest) { + // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value) + // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value) + // is false: !PBI_Cond and BI_Value + Instruction *NotCond = cast( + Builder.CreateNot(PBI->getCondition(), "not.cond")); + MergedCond = cast(Builder.CreateBinOp( + Instruction::And, NotCond, CondInPred, "and.cond")); + if (PBI_C->isOne()) + MergedCond = cast(Builder.CreateBinOp( + Instruction::Or, PBI->getCondition(), MergedCond, "or.cond")); + } else { + // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C) + // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond) + // is false: PBI_Cond and BI_Value + MergedCond = cast(Builder.CreateBinOp( + Instruction::And, PBI->getCondition(), CondInPred, "and.cond")); + if (PBI_C->isOne()) { + Instruction *NotCond = cast( + Builder.CreateNot(PBI->getCondition(), "not.cond")); + MergedCond = cast(Builder.CreateBinOp( + Instruction::Or, NotCond, MergedCond, "or.cond")); + } + } + // Update PHI Node. 
+ PHIs[i]->setIncomingValueForBlock(PBI->getParent(), MergedCond); + } - Instruction *Terminator = BB->getTerminator(); - Builder.SetInsertPoint(Terminator); - switch (Terminator->getOpcode()) { - case Instruction::Br: - Changed |= simplifyBranch(cast(Terminator), Builder); - break; - case Instruction::Ret: - Changed |= simplifyReturn(cast(Terminator), Builder); - break; - case Instruction::Resume: - Changed |= simplifyResume(cast(Terminator), Builder); - break; - case Instruction::CleanupRet: - Changed |= simplifyCleanupReturn(cast(Terminator)); - break; - case Instruction::Switch: - Changed |= simplifySwitch(cast(Terminator), Builder); - break; - case Instruction::Unreachable: - Changed |= simplifyUnreachable(cast(Terminator)); - break; - case Instruction::IndirectBr: - Changed |= simplifyIndirectBr(cast(Terminator)); - break; - } + // PBI is changed to branch to TrueDest below. Remove itself from + // potential phis from all other successors. + if (MSSAU) + MSSAU->changeCondBranchToUnconditionalTo(PBI, TrueDest); - return Changed; -} + // Change PBI from Conditional to Unconditional. + BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI); + EraseTerminatorAndDCECond(PBI, MSSAU, AC); + PBI = New_PBI; + } -bool SimplifyCFGOpt::run(BasicBlock *BB) { - bool Changed = false; + // If BI was a loop latch, it may have had associated loop metadata. + // We need to copy it to the new latch, that is, PBI. + if (MDNode *LoopMD = BI->getMetadata(LLVMContext::MD_loop)) + PBI->setMetadata(LLVMContext::MD_loop, LoopMD); - // Repeated simplify BB as long as resimplification is requested. - do { - Resimplify = false; + // TODO: If BB is reachable from all paths through PredBlock, then we + // could replace PBI's branch probabilities with BI's. - // Perform one round of simplifcation. Resimplify flag will be set if - // another iteration is requested. - Changed |= simplifyOnce(BB); - } while (Resimplify); + // Copy any debug value intrinsics into the end of PredBlock. 
+ for (Instruction &I : *BB) + if (isa(I)) + I.clone()->insertBefore(PBI); - return Changed; + return true; + } + return false; } bool llvm::simplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes -; RUN: opt -S -passes=attributor -aa-pipeline='basic-aa' -attributor-disable=false -attributor-max-iterations-verify -attributor-max-iterations=3 < %s | FileCheck %s +; RUN: opt -S -passes=attributor -aa-pipeline='basic-aa' -attributor-disable=false -attributor-max-iterations-verify -attributor-max-iterations=3 < %s | FileCheck %s --check-prefixes=CHECK,NO_ASSUME +; RUN: opt -S -passes=attributor --enable-knowledge-retention -aa-pipeline='basic-aa' -attributor-disable=false -attributor-max-iterations-verify -attributor-max-iterations=3 < %s | FileCheck %s --check-prefixes=CHECK,USE_ASSUME %struct.ss = type { i32, i64 } @@ -15,15 +16,26 @@ } define i32 @test(i32* %X) { -; CHECK-LABEL: define {{[^@]+}}@test -; CHECK-SAME: (i32* nocapture nofree readonly align 4 [[X:%.*]]) -; CHECK-NEXT: entry: -; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 -; CHECK-NEXT: store i32 1, i32* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 -; CHECK-NEXT: store i64 2, i64* [[TMP4]], align 4 -; CHECK-NEXT: ret i32 0 +; NO_ASSUME-LABEL: define {{[^@]+}}@test +; NO_ASSUME-SAME: (i32* nocapture nofree readonly align 4 [[X:%.*]]) +; NO_ASSUME-NEXT: entry: +; NO_ASSUME-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]] +; NO_ASSUME-NEXT: 
[[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 +; NO_ASSUME-NEXT: store i32 1, i32* [[TMP1]], align 8 +; NO_ASSUME-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 +; NO_ASSUME-NEXT: store i64 2, i64* [[TMP4]], align 4 +; NO_ASSUME-NEXT: ret i32 0 +; +; USE_ASSUME-LABEL: define {{[^@]+}}@test +; USE_ASSUME-SAME: (i32* nocapture nofree readonly align 4 [[X:%.*]]) +; USE_ASSUME-NEXT: entry: +; USE_ASSUME-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]] +; USE_ASSUME-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 +; USE_ASSUME-NEXT: store i32 1, i32* [[TMP1]], align 8 +; USE_ASSUME-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 +; USE_ASSUME-NEXT: store i64 2, i64* [[TMP4]], align 4 +; USE_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[X]], i64 4), "align"(%struct.ss* [[S]], i64 8), "dereferenceable"(i32* [[X]], i64 4), "dereferenceable"(%struct.ss* [[S]], i64 12), "nonnull"(%struct.ss* [[S]]), "nonnull"(i32* [[X]]) ] +; USE_ASSUME-NEXT: ret i32 0 ; entry: %S = alloca %struct.ss diff --git a/llvm/test/Transforms/InstSimplify/load.ll b/llvm/test/Transforms/InstSimplify/load.ll --- a/llvm/test/Transforms/InstSimplify/load.ll +++ b/llvm/test/Transforms/InstSimplify/load.ll @@ -1,20 +1,29 @@ -; NOTE: Assertions have been autogenerated by update_test_checks.py -; RUN: opt < %s -instsimplify -S | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s --check-prefixes=CHECK,NO_ASSUME +; RUN: opt < %s -instsimplify --enable-knowledge-retention -S | FileCheck %s --check-prefixes=CHECK,USE_ASSUME @zeroinit = constant {} zeroinitializer @undef = constant {} undef define i32 @crash_on_zeroinit() { -; CHECK-LABEL: @crash_on_zeroinit( -; CHECK: ret i32 0 +; NO_ASSUME-LABEL: @crash_on_zeroinit( +; NO_ASSUME-NEXT: ret i32 0 +; +; USE_ASSUME-LABEL: 
@crash_on_zeroinit( +; USE_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(i32* bitcast ({}* @zeroinit to i32*), i64 4), "nonnull"(i32* bitcast ({}* @zeroinit to i32*)) ] +; USE_ASSUME-NEXT: ret i32 0 ; %load = load i32, i32* bitcast ({}* @zeroinit to i32*) ret i32 %load } define i32 @crash_on_undef() { -; CHECK-LABEL: @crash_on_undef( -; CHECK: ret i32 undef +; NO_ASSUME-LABEL: @crash_on_undef( +; NO_ASSUME-NEXT: ret i32 undef +; +; USE_ASSUME-LABEL: @crash_on_undef( +; USE_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(i32* bitcast ({}* @undef to i32*), i64 4), "nonnull"(i32* bitcast ({}* @undef to i32*)) ] +; USE_ASSUME-NEXT: ret i32 undef ; %load = load i32, i32* bitcast ({}* @undef to i32*) ret i32 %load @@ -23,8 +32,13 @@ @GV = private constant [8 x i32] [i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49] define <8 x i32> @partial_load() { -; CHECK-LABEL: @partial_load( -; CHECK: ret <8 x i32> +; NO_ASSUME-LABEL: @partial_load( +; NO_ASSUME-NEXT: ret <8 x i32> +; +; USE_ASSUME-LABEL: @partial_load( +; USE_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(<8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @GV, i64 0, i64 -1) to <8 x i32>*), i64 32), "nonnull"(<8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @GV, i64 0, i64 -1) to <8 x i32>*)) ] +; USE_ASSUME-NEXT: ret <8 x i32> +; %load = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @GV, i64 0, i64 -1) to <8 x i32>*) ret <8 x i32> %load } @@ -33,8 +47,13 @@ ; This does an out of bounds load from the global constant define <3 x float> @load_vec3() { -; CHECK-LABEL: @load_vec3( -; CHECK-NEXT: ret <3 x float> undef +; NO_ASSUME-LABEL: @load_vec3( +; NO_ASSUME-NEXT: ret <3 x float> undef +; +; USE_ASSUME-LABEL: @load_vec3( +; USE_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(<3 x float>* getelementptr inbounds (<3 x float>, <3 x float>* @constvec, i64 1), i64 12), "nonnull"(<3 
x float>* getelementptr inbounds (<3 x float>, <3 x float>* @constvec, i64 1)) ] +; USE_ASSUME-NEXT: ret <3 x float> undef +; %1 = load <3 x float>, <3 x float>* getelementptr inbounds (<3 x float>, <3 x float>* @constvec, i64 1) ret <3 x float> %1 } diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -25,12 +25,18 @@ TEST(Local, RecursivelyDeleteDeadPHINodes) { LLVMContext C; + auto M = std::make_unique<Module>("test", C); IRBuilder<> builder(C); + Function *F = Function::Create(FunctionType::get(Type::getVoidTy(C), false), + GlobalValue::ExternalLinkage, "", &*M); + // Make blocks BasicBlock *bb0 = BasicBlock::Create(C); BasicBlock *bb1 = BasicBlock::Create(C); + bb0->insertInto(F); + bb1->insertInto(F); builder.SetInsertPoint(bb0); PHINode *phi = builder.CreatePHI(Type::getInt32Ty(C), 2); @@ -59,11 +65,6 @@ builder.CreateAdd(phi, phi); EXPECT_TRUE(RecursivelyDeleteDeadPHINode(phi)); - - bb0->dropAllReferences(); - bb1->dropAllReferences(); - delete bb0; - delete bb1; } TEST(Local, RemoveDuplicatePHINodes) {