Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -126,6 +126,7 @@
 void initializeEfficiencySanitizerPass(PassRegistry&);
 void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry &);
 void initializeGVNHoistLegacyPassPass(PassRegistry &);
+void initializeGVNSinkLegacyPassPass(PassRegistry &);
 void initializeExpandISelPseudosPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExternalAAWrapperPassPass(PassRegistry&);
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -333,6 +333,12 @@
 //===----------------------------------------------------------------------===//
 //
+// GVNSink - This pass uses GVN to sink equivalent expressions into successors.
+//
+FunctionPass *createGVNSinkPass();
+
+//===----------------------------------------------------------------------===//
+//
 // MergedLoadStoreMotion - This pass merges loads and stores in diamonds. Loads
 // are hoisted into the header, while stores sink into the footer.
 //
Index: include/llvm/Transforms/Scalar/GVN.h
===================================================================
--- include/llvm/Transforms/Scalar/GVN.h
+++ include/llvm/Transforms/Scalar/GVN.h
@@ -64,6 +64,7 @@
 /// as an efficient mechanism to determine the expression-wise equivalence of
 /// two values.
 class ValueTable {
+protected:
   DenseMap<Value *, uint32_t> valueNumbering;
   DenseMap<Expression, uint32_t> expressionNumbering;
   AliasAnalysis *AA;
@@ -72,17 +73,20 @@
   uint32_t nextValueNumber;
 
-  Expression createExpr(Instruction *I);
-  Expression createCmpExpr(unsigned Opcode, CmpInst::Predicate Predicate,
-                           Value *LHS, Value *RHS);
-  Expression createExtractvalueExpr(ExtractValueInst *EI);
-  uint32_t lookupOrAddCall(CallInst *C);
-
+  virtual Expression createExpr(Instruction *I);
+  virtual Expression createCmpExpr(unsigned Opcode,
+                                   CmpInst::Predicate Predicate,
+                                   Value *LHS, Value *RHS);
+  virtual Expression createExtractvalueExpr(ExtractValueInst *EI);
+  virtual uint32_t lookupOrAddCall(CallInst *C);
+  virtual uint32_t lookupOrAddLoad(LoadInst *LI);
+  virtual uint32_t lookupOrAddStore(StoreInst *SI);
+  void modify(Value *V, uint32_t num);
+
 public:
   ValueTable();
   ValueTable(const ValueTable &Arg);
   ValueTable(ValueTable &&Arg);
-  ~ValueTable();
+  virtual ~ValueTable();
 
   uint32_t lookupOrAdd(Value *V);
   uint32_t lookup(Value *V) const;
@@ -100,6 +104,25 @@
   void verifyRemoved(const Value *) const;
 };
 
+/// This table computes a corollary function to ValueTable. Where ValueTable
+/// computes a value number as a function of an operation and its transitive
+/// operands, this computes one as a function of an operation and its
+/// transitive *uses*.
+///
+/// FIXME: Give a fuller description here.
+class PostValueTable : public ValueTable {
+  Expression createExpr(Instruction *I) override;
+  Expression createCmpExpr(unsigned Opcode, CmpInst::Predicate Predicate,
+                           Value *LHS, Value *RHS) override;
+  Expression createExtractvalueExpr(ExtractValueInst *EI) override;
+  uint32_t lookupOrAddCall(CallInst *C) override;
+  uint32_t lookupOrAddLoad(LoadInst *LI) override;
+  uint32_t lookupOrAddStore(StoreInst *SI) override;
+  bool IgnoreMemDeps = false;
+
+public:
+  void setIgnoreMemoryDependencies(bool Ignore) { IgnoreMemDeps = Ignore; }
+  ~PostValueTable() override;
+};
+
 private:
   friend class gvn::GVNLegacyPass;
   friend struct DenseMapInfo<Expression>;
@@ -234,6 +257,11 @@
   /// \brief Run the pass over the function.
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+/// Uses an "inverted" value numbering to sink similar expressions into successors.
+struct GVNSinkPass : PassInfoMixin<GVNSinkPass> {
+  /// \brief Run the pass over the function.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
 }
Index: include/llvm/Transforms/Utils/MemorySSA.h
===================================================================
--- include/llvm/Transforms/Utils/MemorySSA.h
+++ include/llvm/Transforms/Utils/MemorySSA.h
@@ -477,6 +477,7 @@
   MemorySSA(Function &, AliasAnalysis *, DominatorTree *);
   ~MemorySSA();
 
+  void invalidateAll();
   MemorySSAWalker *getWalker();
 
   /// \brief Given a memory Mod/Ref'ing instruction, get the MemorySSA
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -146,6 +146,10 @@
     "enable-gvn-hoist", cl::init(true), cl::Hidden,
     cl::desc("Enable the GVN hoisting pass (default = on)"));
 
+static cl::opt<bool> EnableGVNSink(
+    "enable-gvn-sink", cl::init(true), cl::Hidden,
+    cl::desc("Enable the GVN sinking pass (default = on)"));
+
 PassManagerBuilder::PassManagerBuilder() {
   OptLevel = 2;
   SizeLevel = 0;
@@ -247,6 +251,8 @@
   FPM.add(createEarlyCSEPass());
   if(EnableGVNHoist)
     FPM.add(createGVNHoistPass());
+  if (EnableGVNSink)
+    FPM.add(createGVNSinkPass());
   FPM.add(createLowerExpectIntrinsicPass());
 }
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -13,6 +13,7 @@
   GuardWidening.cpp
   GVN.cpp
   GVNHoist.cpp
+  GVNSink.cpp
   InductiveRangeCheckElimination.cpp
   IndVarSimplify.cpp
   JumpThreading.cpp
Index: lib/Transforms/Scalar/GVN.cpp
===================================================================
--- lib/Transforms/Scalar/GVN.cpp
+++ lib/Transforms/Scalar/GVN.cpp
@@ -334,6 +334,86 @@
 }
 
 //===----------------------------------------------------------------------===//
+// PostValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+
+GVN::PostValueTable::~PostValueTable() {
+}
+
+GVN::Expression GVN::PostValueTable::createExpr(Instruction *I) {
+  Expression e;
+  e.type = I->getType();
+  e.opcode = I->getOpcode();
+  for (auto &U : I->uses())
+    e.varargs.push_back(lookupOrAdd(U.getUser()));
+  std::sort(e.varargs.begin(), e.varargs.end());
+
+  if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+    CmpInst::Predicate Predicate = C->getPredicate();
+    e.opcode = (C->getOpcode() << 8) | Predicate;
+  } else if (isa<InsertValueInst>(I)) {
+    assert(0 && "Implement!");
+  }
+
+  return e;
+}
+
+GVN::Expression GVN::PostValueTable::createCmpExpr(unsigned Opcode,
+                                                   CmpInst::Predicate Predicate,
+                                                   Value *LHS, Value *RHS) {
+  llvm_unreachable("Not implemented!");
+}
+
+GVN::Expression
+GVN::PostValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
+  return createExpr(EI);
+}
+
+uint32_t GVN::PostValueTable::lookupOrAddCall(CallInst *I) {
+  if (AA->doesNotAccessMemory(I) || IgnoreMemDeps) {
+    Expression exp = createExpr(I);
+    uint32_t &e = expressionNumbering[exp];
+    if (!e) e = nextValueNumber++;
+    valueNumbering[I] = e;
+    return e;
+  } else {
+    valueNumbering[I] = nextValueNumber;
+    return nextValueNumber++;
+  }
+}
+
+uint32_t GVN::PostValueTable::lookupOrAddLoad(LoadInst *I) {
+  if (IgnoreMemDeps) {
+    Expression exp = createExpr(I);
+    uint32_t &e = expressionNumbering[exp];
+    if (!e) e = nextValueNumber++;
+    valueNumbering[I] = e;
+    return e;
+  } else {
+    valueNumbering[I] = nextValueNumber;
+    return nextValueNumber++;
+  }
+}
+
+uint32_t GVN::PostValueTable::lookupOrAddStore(StoreInst *I) {
+  if (IgnoreMemDeps) {
+    Expression exp = createExpr(I);
+    uint32_t &e = expressionNumbering[exp];
+    if (!e) e = nextValueNumber++;
+    valueNumbering[I] = e;
+    return e;
+  } else {
+    valueNumbering[I] = nextValueNumber;
+    return nextValueNumber++;
+  }
+}
+
+//===----------------------------------------------------------------------===//
 // ValueTable External Functions
 //===----------------------------------------------------------------------===//
 
@@ -354,6 +434,10 @@
   valueNumbering.insert(std::make_pair(V, num));
 }
 
+void GVN::ValueTable::modify(Value *V, uint32_t num) {
+  valueNumbering[V] = num;
+}
+
 uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
   if (AA->doesNotAccessMemory(C)) {
     Expression exp = createExpr(C);
@@ -463,6 +547,16 @@
   }
 }
 
+uint32_t GVN::ValueTable::lookupOrAddLoad(LoadInst *LI) {
+  valueNumbering[LI] = nextValueNumber;
+  return nextValueNumber++;
+}
+
+uint32_t GVN::ValueTable::lookupOrAddStore(StoreInst *SI) {
+  valueNumbering[SI] = nextValueNumber;
+  return nextValueNumber++;
+}
+
 /// Returns true if a value number exists for the specified value.
 bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
 
@@ -483,6 +577,10 @@
   switch (I->getOpcode()) {
     case Instruction::Call:
       return lookupOrAddCall(cast<CallInst>(I));
+    case Instruction::Load:
+      return lookupOrAddLoad(cast<LoadInst>(I));
+    case Instruction::Store:
+      return lookupOrAddStore(cast<StoreInst>(I));
     case Instruction::Add:
     case Instruction::FAdd:
    case Instruction::Sub:
Index: lib/Transforms/Scalar/GVNSink.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Scalar/GVNSink.cpp
@@ -0,0 +1,813 @@
+//===- GVNSink.cpp - sink expressions into successors --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass attempts to sink instructions into successors, reducing static
+// instruction count and enabling if-conversion.
+//
+// We use a variant of global value numbering to decide what can be sunk.
+// Consider:
+//
+//   [ %a1 = add i32 %b, 1  ]   [ %c1 = add i32 %d, 1  ]
+//   [ %a2 = xor i32 %a1, 1 ]   [ %c2 = xor i32 %c1, 1 ]
+//                 \               /
+//            [ %e = phi i32 %a2, %c2 ]
+//            [ add i32 %e, 4         ]
+//
+// GVN would number %a1 and %c1 differently because they compute different
+// results - the VN of an instruction is a function of its opcode and the
+// transitive closure of its operands. This is the key property for hoisting
+// and CSE.
+//
+// What we want when sinking, however, is a numbering that is a function of
+// the *uses* of an instruction, which allows us to answer the question "if I
+// replace %a1 with %c1, will it contribute in an equivalent way to all
+// successive instructions?". The PostValueTable class in GVN provides this
+// mapping.
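+//
+// For example (an informal sketch of the idea, not the exact numbers the
+// implementation assigns): in the diagram above %a2 and %c2 are used only by
+// the PHI %e, so they receive the same post-value number. %a1 and %c1 are in
+// turn numbered by their (now equivalent) uses, so they also compare equal,
+// and the whole two-instruction chain becomes a candidate for sinking.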
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
+#include <set>
+#include <unordered_set>
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-sink"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace {
+
+// FIXME: Invoke.
+static bool isMemoryInst(Instruction *I) {
+  return isa<LoadInst>(I) || isa<StoreInst>(I) ||
+         (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory());
+}
+
+// LockstepReverseIterator - Iterates through instructions in a set of blocks
+// in reverse order from the first non-terminator.
+// For example (assume all blocks have size n):
+//   LockstepReverseIterator I([B1, B2, B3]);
+//   *I-- = [B1[n], B2[n], B3[n]];
+//   *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+//   *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+//   ...
+//
+// It continues until all blocks have been exhausted. Use \c getActiveBlocks()
+// to determine which blocks are still going and the order they appear in the
+// list returned by operator*.
+class LockstepReverseIterator {
+  ArrayRef<BasicBlock *> Blocks;
+  SmallPtrSet<BasicBlock *, 4> ActiveBlocks;
+  SmallVector<Instruction *, 4> Insts;
+  bool Fail;
+
+public:
+  LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
+    reset();
+  }
+
+  void reset() {
+    Fail = false;
+    ActiveBlocks.clear();
+    for (auto *BB : Blocks)
+      ActiveBlocks.insert(BB);
+    Insts.clear();
+    for (auto *BB : Blocks) {
+      if (BB->size() <= 1) {
+        // Block wasn't big enough - only contained a terminator.
+        ActiveBlocks.erase(BB);
+        continue;
+      }
+      Insts.push_back(BB->getTerminator()->getPrevNode());
+    }
+    if (Insts.empty())
+      Fail = true;
+  }
+
+  bool isValid() const { return !Fail; }
+  ArrayRef<Instruction *> operator*() const { return Insts; }
+  SmallPtrSet<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
+
+  void restrictToBlocks(SmallPtrSetImpl<BasicBlock *> &Blocks) {
+    for (auto II = Insts.begin(); II != Insts.end();) {
+      if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) ==
+          Blocks.end()) {
+        ActiveBlocks.erase((*II)->getParent());
+        II = Insts.erase(II);
+      } else {
+        ++II;
+      }
+    }
+  }
+
+  void operator--() {
+    if (Fail)
+      return;
+    SmallVector<Instruction *, 4> NewInsts;
+    for (auto *Inst : Insts) {
+      if (Inst == &Inst->getParent()->front())
+        ActiveBlocks.erase(Inst->getParent());
+      else
+        NewInsts.push_back(Inst->getPrevNode());
+    }
+    if (NewInsts.empty()) {
+      Fail = true;
+      return;
+    }
+    Insts = NewInsts;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+/// Candidate solution for sinking. There may be different ways to
+/// sink instructions, differing in the number of instructions sunk,
+/// the number of predecessors sunk from and the number of PHIs
+/// required.
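+///
+/// The cost model in calculateCost() below weighs instructions removed
+/// against PHIs created. An illustrative example (numbers invented for
+/// exposition): sinking 3 instructions from 2 blocks at the price of one
+/// extra PHI, with no edge split, scores (3 * (2 - 1)) - (1 * 1) = 2, a
+/// positive (worthwhile) cost.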
+struct SinkingInstructionCandidate {
+  unsigned NumBlocks;
+  unsigned NumInstructions;
+  unsigned NumPHIs;
+  unsigned NumMemoryInsts;
+  int Cost = -1;
+  SmallVector<BasicBlock *, 4> Blocks;
+
+  void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) {
+    unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
+    unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
+    Cost = (NumInstructions * (NumBlocks - 1)) -
+           (NumExtraPHIs *
+            NumExtraPHIs) // PHIs are expensive, so make sure they're worth it.
+           - SplitEdgeCost;
+  }
+  bool operator<(const SinkingInstructionCandidate &Other) const {
+    return Cost < Other.Cost;
+  }
+};
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+                              SinkingInstructionCandidate &C) {
+  OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
+     << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
+  return OS;
+}
+
+//===----------------------------------------------------------------------===//
+
+/// Describes a PHI node that may or may not exist. These track the PHIs
+/// that must be created if we sunk a sequence of instructions. It provides
+/// a hash function for efficient equality comparisons.
+class ModelledPHI {
+  SmallVector<Value *, 4> Values;
+  SmallVector<BasicBlock *, 4> Blocks;
+
+public:
+  ModelledPHI() {}
+  ModelledPHI(PHINode *PN) {
+    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
+      Blocks.push_back(PN->getIncomingBlock(I));
+    std::stable_sort(Blocks.begin(), Blocks.end());
+
+    // This assumes the PHI is already well-formed and there aren't conflicting
+    // incoming values for the same block.
+    for (auto *B : Blocks)
+      Values.push_back(PN->getIncomingValueForBlock(B));
+  }
+
+  // Create a PHI from an array of incoming values and incoming blocks.
+  template <typename VArray, typename BArray>
+  ModelledPHI(const VArray &V, const BArray &B) {
+    std::copy(V.begin(), V.end(), std::back_inserter(Values));
+    std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+  }
+
+  // Create a PHI from [I[OpNum] for I in Insts].
+  template <typename BArray>
+  ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
+    std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+    for (auto *I : Insts)
+      Values.push_back(I->getOperand(OpNum));
+  }
+
+  // Restrict the PHI's contents down to only \c NewBlocks.
+  // \c NewBlocks must be a subset of \c this->Blocks.
+  void restrictToBlocks(const SmallPtrSetImpl<BasicBlock *> &NewBlocks) {
+    auto BI = Blocks.begin();
+    auto VI = Values.begin();
+    while (BI != Blocks.end()) {
+      assert(VI != Values.end());
+      if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) ==
+          NewBlocks.end()) {
+        BI = Blocks.erase(BI);
+        VI = Values.erase(VI);
+      } else {
+        ++BI;
+        ++VI;
+      }
+    }
+    assert(Blocks.size() == NewBlocks.size());
+  }
+
+  ArrayRef<Value *> getValues() const { return Values; }
+
+  bool areAllIncomingValuesSame() const {
+    return all_of(Values, [&](Value *V) { return V == Values[0]; });
+  }
+
+  // Hash functor.
+  size_t operator()(const ModelledPHI &P) const {
+    return hash_combine_range(P.Values.begin(), P.Values.end());
+  }
+  bool operator==(const ModelledPHI &Other) const {
+    return Values == Other.Values && Blocks == Other.Blocks;
+  }
+};
+typedef std::unordered_set<ModelledPHI, ModelledPHI> ModelledPHISet;
+
+//===----------------------------------------------------------------------===//
+
+/// This is a modified version of GVN's PostValueTable. PostValueTable provides
+/// a value numbering based on how a value is *used* rather than what it uses.
+/// We extend it here to be able to reason about memory-touching operations.
+///
+/// In general it is difficult for GVN to reason about memory ordering, so it
+/// returns conservative results.
+/// However, we know that we will only ever be looking at instructions in
+/// blocks that share a common immediate successor. This allows us to know
+/// that the memory state at the end of the blocks will be identical.
+///
+/// Not only this, but we know that we will only sink instructions that are
+/// equivalent, so to reason about a memory instruction I it is sufficient to
+/// count the number of memory defs following I.
+class ValueTable : private GVN::PostValueTable {
+  MemorySSA *MSSA;
+  enum ValueNumberSpecialCases : uint32_t {
+    VN_Volatile = 1U << 31
+  };
+
+public:
+  typedef GVN::PostValueTable super;
+  using super::clear;
+  using super::lookup;
+
+  ValueTable(MemorySSA *MSSA, AliasAnalysis *AA) : MSSA(MSSA) {
+    setAliasAnalysis(AA);
+    setIgnoreMemoryDependencies(true);
+  }
+
+  uint32_t lookupOrAdd(Instruction *I) {
+    // We search in postorder, so we must not have seen this instruction yet.
+    // This is important as we modify the value number for memory-modifying
+    // instructions.
+    if (exists(I))
+      return lookup(I);
+    uint32_t N = super::lookupOrAdd(I);
+
+    // Volatility is not accounted for in the value numbering, so add it
+    // here. We don't support memory ops that are anything other than
+    // simple or volatile (no memory ordering or atomicity).
+    if (auto *SI = dyn_cast<StoreInst>(I)) {
+      if (isStrongerThanUnordered(SI->getOrdering()) || SI->isAtomic())
+        // Not supported.
+        return ~0U;
+      if (SI->isVolatile())
+        N |= VN_Volatile;
+    } else if (auto *LI = dyn_cast<LoadInst>(I)) {
+      if (isStrongerThanUnordered(LI->getOrdering()) || LI->isAtomic())
+        // Not supported.
+        return ~0U;
+      if (LI->isVolatile())
+        N |= VN_Volatile;
+    }
+
+    if (isMemoryInst(I))
+      N |= getMemoryUseOrder(I) << 24;
+
+    modify(I, N);
+    return N;
+  }
+
+  // Given a memory-using instruction \c Inst, return an ID describing the
+  // memory state at \c Inst.
+  // Because we look for sinking opportunities bottom-up, this returns
+  // the number of MemoryDefs after \c Inst.
+  uint8_t getMemoryUseOrder(const Instruction *Inst) {
+    const auto *AL = MSSA->getBlockAccesses(Inst->getParent());
+    const auto *MA = MSSA->getMemoryAccess(Inst);
+    assert(AL && MA && "No memory access?");
+
+    auto I = std::find_if(AL->begin(), AL->end(),
+                          [&MA](const MemoryAccess &A) { return &A == MA; });
+    unsigned N = 0;
+    for (; I != AL->end(); ++I)
+      if (!isa<MemoryUse>(*I))
+        ++N;
+    return N;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+class GVNSink {
+public:
+  GVNSink(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA,
+          MemoryDependenceResults *MD, MemorySSA *MSSA,
+          TargetTransformInfo *TTI, bool OptForMinSize)
+      : VN(MSSA, AA), DT(DT), PDT(PDT), MD(MD), MSSA(MSSA), TTI(TTI),
+        OptForMinSize(OptForMinSize) {}
+
+  bool run(Function &F) {
+    (void)TTI;
+    (void)OptForMinSize;
+
+    DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n");
+
+    // FIXME: use lazy evaluation of VN to avoid the fix-point computation.
+    bool Res = false;
+    while (true) {
+      auto State = sinkExpressions();
+      if (State.first == 0)
+        return Res;
+
+      if (State.second) {
+        // To address a limitation of the current GVN, we need to rerun
+        // sinking after we sank loads or stores, in order to be able to
+        // sink all scalars dependent on the sunk ld/st.
+        VN.clear();
+        MSSA->invalidateAll();
+      }
+      Res = true;
+    }
+
+    return Res;
+  }
+
+private:
+  ValueTable VN;
+  DominatorTree *DT;
+  PostDominatorTree *PDT;
+  MemoryDependenceResults *MD;
+  MemorySSA *MSSA;
+  const TargetTransformInfo *TTI;
+  const bool OptForMinSize;
+
+  bool isInstructionBlacklisted(Instruction *I) {
+    // These instructions may change or break semantics if moved.
+    if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+        I->getType()->isTokenTy())
+      return true;
+    // FIXME: We don't support invokes yet.
+    if (isa<InvokeInst>(I))
+      return true;
+    return false;
+  }
+
+  // Is it legal to place a variable in operand \c OpIdx of \c I?
+  // FIXME: This should be promoted to Instruction.
+  bool canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
+    // We can't have a PHI with a metadata type.
+    if (I->getOperand(OpIdx)->getType()->isMetadataTy())
+      return false;
+
+    // Early exit.
+    if (!isa<Constant>(I->getOperand(OpIdx)))
+      return true;
+
+    switch (I->getOpcode()) {
+    default:
+      return true;
+    case Instruction::Call:
+    case Instruction::Invoke:
+      // FIXME: Many arithmetic intrinsics have no issue taking a
+      // variable, however it's hard to distinguish these from
+      // specials such as @llvm.frameaddress that require a constant.
+      return !isa<IntrinsicInst>(I);
+    case Instruction::ShuffleVector:
+      // Shufflevector masks are constant.
+      return OpIdx != 2;
+    case Instruction::ExtractValue:
+    case Instruction::InsertValue:
+      // All operands apart from the first are constant.
+      return OpIdx == 0;
+    case Instruction::Alloca:
+      return false;
+    case Instruction::GetElementPtr:
+      if (OpIdx == 0)
+        return true;
+      gep_type_iterator It = std::next(gep_type_begin(I), OpIdx - 1);
+      return !It->isStructTy();
+    }
+  }
+
+  // The main heuristic function. Analyze the set of instructions pointed to by
+  // LRI and return a candidate solution if these instructions can be sunk, or
+  // None otherwise.
+  Optional<SinkingInstructionCandidate>
+  analyzeInstructionForSinking(LockstepReverseIterator &LRI,
+                               unsigned &InstNum, unsigned &MemoryInstNum,
+                               ModelledPHISet &NeededPHIs,
+                               std::set<Value *> &PHIContents) {
+    auto Insts = *LRI;
+    DEBUG(
+      dbgs() << " -- Analyzing instruction set: [\n";
+      for (auto *I : Insts)
+        I->dump();
+      dbgs() << " ]\n";
+    );
+
+    DenseMap<uint32_t, unsigned> VNums;
+    for (auto *I : Insts) {
+      uint32_t N = VN.lookupOrAdd(I);
+      DEBUG(dbgs() << " VN=" << utohexstr(N) << " for" << *I << "\n");
+      if (N == ~0U)
+        return None;
+      VNums[N]++;
+    }
+    unsigned VNumToSink =
+        std::max_element(VNums.begin(), VNums.end(),
+                         [](const std::pair<uint32_t, unsigned> &I,
+                            const std::pair<uint32_t, unsigned> &J) {
+                           return I.second < J.second;
+                         })->first;
+
+    if (VNums[VNumToSink] == 1)
+      // Can't sink anything!
+      return None;
+
+    // Now restrict the number of incoming blocks down to only those with
+    // VNumToSink.
+    auto &ActivePreds = LRI.getActiveBlocks();
+    unsigned InitialActivePredSize = ActivePreds.size();
+    SmallVector<Instruction *, 4> NewInsts;
+    for (auto *I : Insts) {
+      if (VN.lookup(I) != VNumToSink)
+        ActivePreds.erase(I->getParent());
+      else
+        NewInsts.push_back(I);
+    }
+    for (auto *I : NewInsts)
+      if (isInstructionBlacklisted(I))
+        return None;
+
+    // If we've restricted the incoming blocks, restrict all needed PHIs also
+    // to that set.
+    // FIXME: Make a testcase for this!
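+    // For example (illustrative): if only two of three incoming blocks
+    // compute VNumToSink, every PHI modelled so far must drop the incoming
+    // value contributed by the third block before the analysis continues.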
+    bool RecomputePHIContents = false;
+    if (ActivePreds.size() != InitialActivePredSize) {
+      ModelledPHISet NewNeededPHIs;
+      for (auto P : NeededPHIs) {
+        P.restrictToBlocks(ActivePreds);
+        NewNeededPHIs.insert(P);
+      }
+      NeededPHIs = NewNeededPHIs;
+      LRI.restrictToBlocks(ActivePreds);
+      RecomputePHIContents = true;
+    }
+
+    // The sunk instruction's results.
+    ModelledPHI NewPHI(NewInsts, ActivePreds);
+
+    // Does sinking this instruction render previous PHIs redundant?
+    if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) {
+      NeededPHIs.erase(NewPHI);
+      RecomputePHIContents = true;
+    }
+
+    if (RecomputePHIContents) {
+      // The needed PHIs have changed, so recompute the set of all needed
+      // values.
+      PHIContents.clear();
+      for (auto &PHI : NeededPHIs)
+        PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+    }
+
+    // Is this instruction required by a later PHI that doesn't match this PHI?
+    // If so, we can't sink this instruction.
+    for (auto *V : NewPHI.getValues())
+      if (PHIContents.count(V))
+        // V exists in this PHI, but the whole PHI is different to NewPHI
+        // (else it would have been removed earlier). We cannot continue
+        // because this isn't representable.
+        return None;
+
+    // Which operands need PHIs?
+    // FIXME: If any of these fail, we should partition up the candidates to
+    // try and continue making progress.
+    Instruction *I0 = NewInsts[0];
+    for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
+      ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
+      if (PHI.areAllIncomingValuesSame())
+        continue;
+      if (NeededPHIs.count(PHI))
+        continue;
+      if (!canReplaceOperandWithVariable(I0, OpNum))
+        // We can't create a PHI from this instruction!
+        return None;
+      // Don't create indirect calls! The called value is the final operand.
+      if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1)
+        // FIXME: if the call was *already* indirect, we should do this.
+        return None;
+
+      NeededPHIs.insert(PHI);
+    }
+
+    if (isMemoryInst(NewInsts[0]))
+      ++MemoryInstNum;
+
+    SinkingInstructionCandidate Cand;
+    Cand.NumInstructions = ++InstNum;
+    Cand.NumMemoryInsts = MemoryInstNum;
+    Cand.NumBlocks = ActivePreds.size();
+    Cand.NumPHIs = NeededPHIs.size();
+    for (auto *C : ActivePreds)
+      Cand.Blocks.push_back(C);
+
+    return Cand;
+  }
+
+  // Create a ModelledPHI for each PHI in BB, adding to PHIs.
+  void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
+                          std::set<Value *> &PHIContents) {
+    for (auto &I : *BB) {
+      auto *PN = dyn_cast<PHINode>(&I);
+      if (!PN)
+        return;
+
+      auto MPHI = ModelledPHI(PN);
+      PHIs.insert(MPHI);
+      for (auto *V : MPHI.getValues())
+        PHIContents.insert(V);
+    }
+  }
+
+  // The main instruction sinking driver. Set up state and try and sink
+  // instructions into BBEnd from its predecessors.
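+  // In outline (a summary of the code below): collect the candidate
+  // predecessors, walk their instructions in lockstep from the bottom using
+  // LockstepReverseIterator, record one SinkingInstructionCandidate per
+  // depth, then sink the instructions of the cheapest positive-cost
+  // candidate.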
+  std::pair<unsigned, bool> sinkBB(BasicBlock *BBEnd) {
+    DEBUG(dbgs() << "GVNSink: running on basic block ";
+          BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+    SmallVector<BasicBlock *, 4> Preds;
+    for (auto *B : predecessors(BBEnd)) {
+      auto *T = B->getTerminator();
+      if (isa<BranchInst>(T) || isa<SwitchInst>(T))
+        Preds.push_back(B);
+      else
+        return {0, false};
+    }
+    if (Preds.size() < 2)
+      return {0, false};
+    std::stable_sort(Preds.begin(), Preds.end());
+
+    LockstepReverseIterator LRI(Preds);
+    SmallVector<SinkingInstructionCandidate, 4> Candidates;
+    unsigned InstNum = 0, MemoryInstNum = 0;
+    ModelledPHISet NeededPHIs;
+    std::set<Value *> PHIContents;
+    analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
+    unsigned NumOrigPHIs = NeededPHIs.size();
+
+    while (LRI.isValid()) {
+      auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
+                                               NeededPHIs, PHIContents);
+      if (!Cand)
+        break;
+      Cand->calculateCost(NumOrigPHIs, Preds.size());
+      Candidates.emplace_back(*Cand);
+      --LRI;
+    }
+
+    std::stable_sort(Candidates.begin(), Candidates.end());
+    std::reverse(Candidates.begin(), Candidates.end());
+    DEBUG(
+      dbgs() << " -- Sinking candidates:\n";
+      for (auto &C : Candidates)
+        dbgs() << "  " << C << "\n";
+    );
+
+    // Pick the top candidate, as long as its cost is positive!
+    if (Candidates.empty() || Candidates.front().Cost <= 0)
+      return {0, false};
+    auto C = Candidates.front();
+
+    DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+    BasicBlock *InsertBB = BBEnd;
+    if (C.Blocks.size() < Preds.size()) {
+      DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs());
+            dbgs() << "\n");
+      InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
+      if (!InsertBB) {
+        DEBUG(dbgs() << " -- FAILED to split edge!\n");
+        // Edge couldn't be split.
+        return {0, false};
+      }
+      // FIXME: Make SplitBlockPredecessors update PDT too.
+      DT->recalculate(*BBEnd->getParent());
+      PDT->recalculate(*BBEnd->getParent());
+    }
+
+    for (unsigned I = 0; I < C.NumInstructions; ++I)
+      sinkLastInstruction(C.Blocks, InsertBB);
+
+    return {C.NumInstructions,
+            C.NumMemoryInsts > 0 || C.Blocks.size() < Preds.size()};
+  }
+
+  // Perform the actual mechanics of sinking an instruction from Blocks into
+  // BBEnd, which is their only successor.
+  void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd) {
+    SmallVector<Instruction *, 4> Insts;
+    for (auto *BB : Blocks)
+      Insts.push_back(BB->getTerminator()->getPrevNode());
+    Instruction *I0 = Insts.front();
+
+    SmallVector<Value *, 4> NewOperands;
+    for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+      bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
+        return I->getOperand(O) != I0->getOperand(O);
+      });
+      if (!NeedPHI) {
+        NewOperands.push_back(I0->getOperand(O));
+        continue;
+      }
+
+      // Create a new PHI in the successor block and populate it.
+      auto *Op = I0->getOperand(O);
+      assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+      auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+                                 Op->getName() + ".sink", &BBEnd->front());
+      for (auto *I : Insts)
+        PN->addIncoming(I->getOperand(O), I->getParent());
+      NewOperands.push_back(PN);
+    }
+
+    // FIXME: I'm certain this MemorySSA update code is wrong, but not sure
+    // how to fix it.
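+    // What the update below attempts: ensure BBEnd has a MemoryPhi merging
+    // each predecessor's memory state, point its incoming values at the
+    // defining accesses of the instructions being sunk, and remove those
+    // instructions' own accesses.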
+    if (isMemoryInst(I0)) {
+      MemoryPhi *PN = MSSA->getMemoryAccess(BBEnd);
+      MemoryPhi *CreatedPN = nullptr;
+      if (!PN)
+        PN = CreatedPN = MSSA->createMemoryPhi(BBEnd);
+      for (auto *I : Insts) {
+        auto *MA = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(I));
+        if (CreatedPN)
+          PN->addIncoming(MA->getDefiningAccess(), I->getParent());
+        else
+          PN->setIncomingValue(PN->getBasicBlockIndex(I->getParent()),
+                               MA->getDefiningAccess());
+        MSSA->removeMemoryAccess(MA);
+      }
+    }
+
+    // Arbitrarily use I0 as the new "common" instruction; remap its operands
+    // and move it to the start of the successor block.
+    for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+      I0->getOperandUse(O).set(NewOperands[O]);
+    I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+    // Update metadata and IR flags.
+    for (auto *I : Insts)
+      if (I != I0) {
+        combineMetadataForCSE(I0, I);
+        I0->andIRFlags(I);
+      }
+
+    if (isMemoryInst(I0)) {
+      auto *PN = MSSA->getMemoryAccess(BBEnd);
+      MSSA->createMemoryAccessInBB(I0, PN, BBEnd, MemorySSA::Beginning);
+    }
+
+    for (auto *I : Insts)
+      if (I != I0)
+        I->replaceAllUsesWith(I0);
+    foldPointlessPHINodes(BBEnd);
+
+    // Finally nuke all instructions apart from the common instruction.
+    for (auto *I : Insts)
+      if (I != I0)
+        I->eraseFromParent();
+    // FIXME: !
+    MSSA->invalidateAll();
+  }
+
+  // Remove PHIs that all have the same incoming value.
+  void foldPointlessPHINodes(BasicBlock *BB) {
+    auto I = BB->begin();
+    while (PHINode *PN = dyn_cast<PHINode>(&*I++)) {
+      if (!all_of(PN->incoming_values(), [&](const Value *V) {
+            return V == PN->getIncomingValue(0);
+          }))
+        continue;
+      if (PN->getIncomingValue(0) != PN)
+        PN->replaceAllUsesWith(PN->getIncomingValue(0));
+      else
+        PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+
+      MD->removeInstruction(PN); // Memdep updates AA itself.
+      PN->eraseFromParent();
+    }
+  }
+
+  // Sink all expressions. Return the number of instructions sunk, and whether
+  // a memory instruction was sunk or an edge was split.
+  std::pair<unsigned, bool> sinkExpressions() {
+    if (!PDT->getRootNode())
+      // This can happen if the function has no returns.
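+      // (For example a function whose body is an infinite loop; with no
+      // reachable exit there is no post-dominator root to walk from.)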
+      return {0, false};
+    for (auto &N : depth_first(PDT)) {
+      auto Res = sinkBB(N->getBlock());
+      if (Res.first)
+        return Res;
+    }
+    return {0, false};
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Pass machinery / boilerplate
+
+class GVNSinkLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  GVNSinkLegacyPass() : FunctionPass(ID) {
+    initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+    auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+    auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+    auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+    GVNSink G(&DT, &PDT, &AA, &MD, &MSSA, &TTI, F.optForMinSize());
+    return G.run(F);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<MemoryDependenceWrapperPass>();
+    AU.addRequired<MemorySSAWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<MemorySSAWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+};
+} // namespace
+
+PreservedAnalyses GVNSinkPass::run(Function &F,
+                                   FunctionAnalysisManager &AM) {
+  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+  MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  GVNSink G(&DT, &PDT, &AA, &MD, &MSSA, &TTI, F.optForMinSize());
+  if (!G.run(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<MemorySSAAnalysis>();
+  PA.preserve<GlobalsAA>();
+  return PA;
+}
+
+char GVNSinkLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
+                      "Early GVN sinking of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink",
+                    "Early GVN sinking of Expressions", false, false)
+
+FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); }
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -46,6 +46,7 @@
   initializeEarlyCSELegacyPassPass(Registry);
   initializeEarlyCSEMemSSALegacyPassPass(Registry);
   initializeGVNHoistLegacyPassPass(Registry);
+  initializeGVNSinkLegacyPassPass(Registry);
   initializeFlattenCFGPassPass(Registry);
   initializeInductiveRangeCheckEliminationPass(Registry);
   initializeIndVarSimplifyLegacyPassPass(Registry);
Index: lib/Transforms/Utils/MemorySSA.cpp
===================================================================
--- lib/Transforms/Utils/MemorySSA.cpp
+++ lib/Transforms/Utils/MemorySSA.cpp
@@ -1232,6 +1232,22 @@
     MA.dropAllReferences();
 }
 
+void MemorySSA::invalidateAll() {
+  // Drop all our references.
+  for (const auto &Pair : PerBlockAccesses)
+    for (MemoryAccess &MA : *Pair.second)
+      MA.dropAllReferences();
+  LiveOnEntryDef = nullptr;
+  Walker = nullptr;
+  NextID = 0;
+  ValueToMemoryAccess.clear();
+  PerBlockAccesses.clear();
+  BlockNumberingValid.clear();
+  BlockNumbering.clear();
+
+  buildMemorySSA();
+}
+
 MemorySSA::AccessList *MemorySSA::getOrCreateAccessList(const BasicBlock *BB) {
   auto Res = PerBlockAccesses.insert(std::make_pair(BB, nullptr));
Index: test/Transforms/GVNSink/sink-common-code.ll
===================================================================
--- /dev/null
+++ test/Transforms/GVNSink/sink-common-code.ll
@@ -0,0 +1,694 @@
+; RUN: opt < %s -gvn-sink -simplifycfg -simplifycfg-sink-common=false -S | FileCheck %s
+
+define zeroext i1 @test1(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+; CHECK-LABEL: test1
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.else:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+define zeroext i1 @test2(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+; CHECK-LABEL: test2
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.else:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp uge i32 %blksA, %add
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+declare i32 @foo(i32, i32) nounwind readnone
+
+define i32 @test3(i1 zeroext %flag, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %x0 = call i32 @foo(i32 %x, i32 0) nounwind readnone
+  %y0 = call i32 @foo(i32 %x, i32 1) nounwind readnone
+  br label %if.end
+
+if.else:
+  %x1 = call i32 @foo(i32 %y, i32 0) nounwind readnone
+  %y1 = call i32 @foo(i32 %y, i32 1) nounwind readnone
+  br label %if.end
+
+if.end:
+  %xx = phi i32 [ %x0, %if.then ], [ %x1, %if.else ]
+  %yy = phi i32 [ %y0, %if.then ], [ %y1, %if.else ]
+  %ret = add i32 %xx, %yy
+  ret i32 %ret
+}
+
+; CHECK-LABEL: test3
+; CHECK: select
+; CHECK: call
+; CHECK: call
+; CHECK: add
+; CHECK-NOT: br
+
+define i32 @test4(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %a = add i32 %x, 5
+  store i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %b = add i32 %x, 7
+  store i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test4
+; CHECK: select
+; CHECK: store
+; CHECK-NOT: store
+
+define i32 @test5(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %a = add i32 %x, 5
+  store volatile i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %b = add i32 %x, 7
+  store i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test5
+; CHECK: store volatile
+; CHECK: store
+
+define i32 @test6(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %a = add i32 %x, 5
+  store volatile i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %b = add i32 %x, 7
+  store volatile i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test6
+; CHECK: select
+; CHECK: store volatile
+; CHECK-NOT: store
+
+define i32 @test7(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %z = load volatile i32, i32* %y
+  %a = add i32 %z, 5
+  store volatile i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %w = load volatile i32, i32* %y
+  %b = add i32 %w, 7
+  store volatile i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test7
+; CHECK-DAG: select
+; CHECK-DAG: load volatile
+; CHECK: store volatile
+; CHECK-NOT: load
+; CHECK-NOT: store
+
+; The extra store in %if.then means %z and %w are not equivalent.
+define i32 @test9(i1 zeroext %flag, i32 %x, i32* %y, i32* %p) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  store i32 7, i32* %p
+  %z = load volatile i32, i32* %y
+  store i32 6, i32* %p
+  %a = add i32 %z, 5
+  store volatile i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %w = load volatile i32, i32* %y
+  %b = add i32 %w, 7
+  store volatile i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test9
+; CHECK: add
+; CHECK: add
+
+%struct.anon = type { i32, i32 }
+
+; The GEP indexes a struct type so cannot have a variable last index.
+define i32 @test10(i1 zeroext %flag, i32 %x, i32* %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 5
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 0
+  store volatile i32 %x, i32* %gepa
+  br label %if.end
+
+if.else:
+  %dummy1 = add i32 %x, 6
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  store volatile i32 %x, i32* %gepb
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test10
+; CHECK: getelementptr
+; CHECK: store volatile
+; CHECK: getelementptr
+; CHECK: store volatile
+
+; The shufflevector's mask operand cannot be merged in a PHI.
+define i32 @test11(i1 zeroext %flag, i32 %w, <2 x i32> %x, <2 x i32> %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %w, 5
+  %sv1 = shufflevector <2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 0, i32 1>
+  br label %if.end
+
+if.else:
+  %dummy1 = add i32 %w, 6
+  %sv2 = shufflevector <2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 1, i32 0>
+  br label %if.end
+
+if.end:
+  %p = phi <2 x i32> [ %sv1, %if.then ], [ %sv2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test11
+; CHECK: shufflevector
+; CHECK: shufflevector
+
+; We can't common an intrinsic!
+define i32 @test12(i1 zeroext %flag, i32 %w, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %w, 5
+  %sv1 = call i32 @llvm.ctlz.i32(i32 %x)
+  br label %if.end
+
+if.else:
+  %dummy1 = add i32 %w, 6
+  %sv2 = call i32 @llvm.cttz.i32(i32 %x)
+  br label %if.end
+
+if.end:
+  %p = phi i32 [ %sv1, %if.then ], [ %sv2, %if.else ]
+  ret i32 1
+}
+
+declare i32 @llvm.ctlz.i32(i32 %x) readnone
+declare i32 @llvm.cttz.i32(i32 %x) readnone
+
+; CHECK-LABEL: test12
+; CHECK: call i32 @llvm.ctlz
+; CHECK: call i32 @llvm.cttz
+
+; The TBAA metadata should be properly combined.
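+; (combineMetadataForCSE should widen the two distinct float subtypes to
+; their common ancestor type; the expected result is pinned down by the
+; metadata CHECK lines at the end of this file.)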
+define i32 @test13(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %z = load volatile i32, i32* %y
+  %a = add i32 %z, 5
+  store volatile i32 %a, i32* %y, !tbaa !3
+  br label %if.end
+
+if.else:
+  %w = load volatile i32, i32* %y
+  %b = add i32 %w, 7
+  store volatile i32 %b, i32* %y, !tbaa !4
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+!0 = !{ !"an example type tree" }
+!1 = !{ !"int", !0 }
+!2 = !{ !"float", !0 }
+!3 = !{ !"const float", !2, i64 0 }
+!4 = !{ !"special float", !2, i64 1 }
+
+; CHECK-LABEL: test13
+; CHECK-DAG: select
+; CHECK-DAG: load volatile
+; CHECK: store volatile {{.*}}, !tbaa !0
+; CHECK-NOT: load
+; CHECK-NOT: store
+
+; The call should be commoned.
+define i32 @test13a(i1 zeroext %flag, i32 %w, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %sv1 = call i32 @bar(i32 %x)
+  br label %if.end
+
+if.else:
+  %sv2 = call i32 @bar(i32 %y)
+  br label %if.end
+
+if.end:
+  %p = phi i32 [ %sv1, %if.then ], [ %sv2, %if.else ]
+  ret i32 1
+}
+declare i32 @bar(i32)
+
+; CHECK-LABEL: test13a
+; CHECK: %[[x:.*]] = select i1 %flag
+; CHECK: call i32 @bar(i32 %[[x]])
+
+; The load should be commoned.
+define i32 @test14(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 1
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv1 = load i32, i32* %gepa
+  %cmp1 = icmp eq i32 %sv1, 56
+  br label %if.end
+
+if.else:
+  %dummy2 = add i32 %x, 4
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv2 = load i32, i32* %gepb
+  %cmp2 = icmp eq i32 %sv2, 57
+  br label %if.end
+
+if.end:
+  %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test14
+; CHECK: getelementptr
+; CHECK: load
+; CHECK-NOT: load
+
+; The load should be commoned.
+define i32 @test15(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 1
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 0
+  %sv1 = load i32, i32* %gepa
+  %ext1 = zext i32 %sv1 to i64
+  %cmp1 = icmp eq i64 %ext1, 56
+  br label %if.end
+
+if.else:
+  %dummy2 = add i32 %x, 4
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv2 = load i32, i32* %gepb
+  %ext2 = zext i32 %sv2 to i64
+  %cmp2 = icmp eq i64 %ext2, 56
+  br label %if.end
+
+if.end:
+  %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test15
+; CHECK: getelementptr
+; CHECK: load
+; CHECK-NOT: load
+
+define zeroext i1 @test_crash(i1 zeroext %flag, i32* %i4, i32* %m, i32* %n) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %tmp1 = load i32, i32* %i4
+  %tmp2 = add i32 %tmp1, -1
+  store i32 %tmp2, i32* %i4
+  br label %if.end
+
+if.else:
+  %tmp3 = load i32, i32* %m
+  %tmp4 = load i32, i32* %n
+  %tmp5 = add i32 %tmp3, %tmp4
+  store i32 %tmp5, i32* %i4
+  br label %if.end
+
+if.end:
+  ret i1 true
+}
+
+; CHECK-LABEL: test_crash
+; No checks for test_crash - just ensure it doesn't crash!
+
+define zeroext i1 @test16(i1 zeroext %flag, i1 zeroext %flag2, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.else:
+  br i1 %flag2, label %if.then2, label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.then2 ], [ 0, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+; CHECK-LABEL: test16
+; CHECK: zext
+; CHECK: zext
+
+define zeroext i1 @test16a(i1 zeroext %flag, i1 zeroext %flag2, i32 %blksA, i32 %blksB, i32 %nblks, i8* %p) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  %b1 = sext i8 %frombool1 to i32
+  %b2 = trunc i32 %b1 to i8
+  store i8 %b2, i8* %p
+  br label %if.end
+
+if.else:
+  br i1 %flag2, label %if.then2, label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  %a1 = sext i8 %frombool3 to i32
+  %a2 = trunc i32 %a1 to i8
+  store i8 %a2, i8* %p
+  br label %if.end
+
+if.end:
+  ret i1 true
+}
+
+; CHECK-LABEL: test16a
+; CHECK: zext
+; CHECK-NOT: zext
+
+define zeroext i1 @test17(i32 %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  switch i32 %flag, label %if.end [
+    i32 0, label %if.then
+    i32 1, label %if.then2
+  ]
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = call i8 @i1toi8(i1 %cmp)
+  %a1 = sext i8 %frombool1 to i32
+  %a2 = trunc i32 %a1 to i8
+  br label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = call i8 @i1toi8(i1 %cmp2)
+  %b1 = sext i8 %frombool3 to i32
+  %b2 = trunc i32 %b1 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %a2, %if.then ], [ %b2, %if.then2 ], [ 0, %entry ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+declare i8 @i1toi8(i1)
+
+; FIXME: DISABLED - we don't consider this profitable. We should
+;  - Consider argument setup/return mov'ing for calls, like InlineCost does.
+;  - Consider the removal of the %obeys.0 PHI (zero PHI movement overall)
+
+; DISABLED-CHECK-LABEL: test17
+; DISABLED-CHECK: if.then:
+; DISABLED-CHECK-NEXT: icmp uge
+; DISABLED-CHECK-NEXT: br label %[[x:.*]]
+
+; DISABLED-CHECK: if.then2:
+; DISABLED-CHECK-NEXT: add
+; DISABLED-CHECK-NEXT: icmp ule
+; DISABLED-CHECK-NEXT: br label %[[x]]
+
+; DISABLED-CHECK: [[x]]:
+; DISABLED-CHECK-NEXT: %[[y:.*]] = phi i1 [ %cmp
+; DISABLED-CHECK-NEXT: %[[z:.*]] = call i8 @i1toi8(i1 %[[y]])
+; DISABLED-CHECK-NEXT: br label %if.end
+
+; DISABLED-CHECK: if.end:
+; DISABLED-CHECK-NEXT: phi i8
+; DISABLED-CHECK-DAG: [ %[[z]], %[[x]] ]
+; DISABLED-CHECK-DAG: [ 0, %entry ]
+
+define zeroext i1 @test18(i32 %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  switch i32 %flag, label %if.then3 [
+    i32 0, label %if.then
+    i32 1, label %if.then2
+  ]
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.then3:
+  %add2 = add i32 %nblks, %blksA
+  %cmp3 = icmp ule i32 %add2, %blksA
+  %frombool4 = zext i1 %cmp3 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.then2 ], [ %frombool4, %if.then3 ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+; CHECK-LABEL: test18
+; CHECK: if.end:
+; CHECK-NEXT: %[[x:.*]] = phi i1
+; CHECK-DAG: [ %cmp, %if.then ]
+; CHECK-DAG: [ %cmp2, %if.then2 ]
+; CHECK-DAG: [ %cmp3, %if.then3 ]
+; CHECK-NEXT: zext i1 %[[x]] to i8
+
+; The phi is confusing - both add instructions are used by it, but
+; not on their respective unconditional arcs. It should not be
+; optimized.
+define void @test_pr30292(i1 %cond, i1 %cond2, i32 %a, i32 %b) {
+entry:
+  %add1 = add i32 %a, 1
+  br label %succ
+
+one:
+  br i1 %cond, label %two, label %succ
+
+two:
+  call void @g()
+  %add2 = add i32 %a, 1
+  br label %succ
+
+succ:
+  %p = phi i32 [ 0, %entry ], [ %add1, %one ], [ %add2, %two ]
+  br label %one
+}
+declare void @g()
+
+; CHECK-LABEL: test_pr30292
+; CHECK: phi i32 [ 0, %entry ], [ %add1, %succ ], [ %add2, %two ]
+
+define zeroext i1 @test_pr30244(i1 zeroext %flag, i1 zeroext %flag2, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  %p = alloca i8
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  store i8 %frombool1, i8* %p
+  br label %if.end
+
+if.else:
+  br i1 %flag2, label %if.then2, label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  store i8 %frombool3, i8* %p
+  br label %if.end
+
+if.end:
+  ret i1 true
+}
+
+; CHECK-LABEL: @test_pr30244
+; CHECK: store
+; CHECK: store
+
+define i32 @test_pr30373a(i1 zeroext %flag, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %x0 = call i32 @foo(i32 %x, i32 0) nounwind readnone
+  %y0 = call i32 @foo(i32 %x, i32 1) nounwind readnone
+  %z0 = lshr i32 %y0, 8
+  br label %if.end
+
+if.else:
+  %x1 = call i32 @foo(i32 %y, i32 0) nounwind readnone
+  %y1 = call i32 @foo(i32 %y, i32 1) nounwind readnone
+  %z1 = lshr exact i32 %y1, 8
+  br label %if.end
+
+if.end:
+  %xx = phi i32 [ %x0, %if.then ], [ %x1, %if.else ]
+  %yy = phi i32 [ %z0, %if.then ], [ %z1, %if.else ]
+  %ret = add i32 %xx, %yy
+  ret i32 %ret
+}
+
+; CHECK-LABEL: test_pr30373a
+; CHECK: lshr
+; CHECK-NOT: exact
+; CHECK: }
+
+define i32 @test_pr30373b(i1 zeroext %flag, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %x0 = call i32 @foo(i32 %x, i32 0) nounwind readnone
+  %y0 = call i32 @foo(i32 %x, i32 1) nounwind readnone
+  %z0 = lshr exact i32 %y0, 8
+  br label %if.end
+
+if.else:
+  %x1 = call i32 @foo(i32 %y, i32 0) nounwind readnone
+  %y1 = call i32 @foo(i32 %y, i32 1) nounwind readnone
+  %z1 = lshr i32 %y1, 8
+  br label %if.end
+
+if.end:
+  %xx = phi i32 [ %x0, %if.then ], [ %x1, %if.else ]
+  %yy = phi i32 [ %z0, %if.then ], [ %z1, %if.else ]
+  %ret = add i32 %xx, %yy
+  ret i32 %ret
+}
+
+; CHECK-LABEL: test_pr30373b
+; CHECK: lshr
+; CHECK-NOT: exact
+; CHECK: }
+
+; CHECK: !0 = !{!1, !1, i64 0}
+; CHECK: !1 = !{!"float", !2}
+; CHECK: !2 = !{!"an example type tree"}
Index: test/Transforms/SimplifyCFG/sink-common-code.ll
===================================================================
--- test/Transforms/SimplifyCFG/sink-common-code.ll
+++ test/Transforms/SimplifyCFG/sink-common-code.ll
@@ -414,7 +414,7 @@
   %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
   %sv2 = load i32, i32* %gepb
   %ext2 = zext i32 %sv2 to i64
-  %cmp2 = icmp eq i64 %ext2, 57
+  %cmp2 = icmp eq i64 %ext2, 56
   br label %if.end
 
 if.end:
@@ -488,7 +488,9 @@
 if.then:
   %cmp = icmp uge i32 %blksA, %nblks
   %frombool1 = zext i1 %cmp to i8
-  store i8 %frombool1, i8* %p
+  %b1 = sext i8 %frombool1 to i32
+  %b2 = trunc i32 %b1 to i8
+  store i8 %b2, i8* %p
   br label %if.end
 
 if.else:
@@ -498,7 +500,9 @@
   %add = add i32 %nblks, %blksB
   %cmp2 = icmp ule i32 %add, %blksA
   %frombool3 = zext i1 %cmp2 to i8
-  store i8 %frombool3, i8* %p
+  %a1 = sext i8 %frombool3 to i32
+  %a2 = trunc i32 %a1 to i8
+  store i8 %a2, i8* %p
   br label %if.end
 
 if.end:
@@ -519,16 +523,20 @@
 if.then:
   %cmp = icmp uge i32 %blksA, %nblks
   %frombool1 = call i8 @i1toi8(i1 %cmp)
+  %a1 = sext i8 %frombool1 to i32
+  %a2 = trunc i32 %a1 to i8
   br label %if.end
 
 if.then2:
   %add = add i32 %nblks, %blksB
   %cmp2 = icmp ule i32 %add, %blksA
   %frombool3 = call i8 @i1toi8(i1 %cmp2)
+  %b1 = sext i8 %frombool3 to i32
+  %b2 = trunc i32 %b1 to i8
   br label %if.end
 
 if.end:
-  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.then2 ], [ 0, %entry ]
+  %obeys.0 = phi i8 [ %a2, %if.then ], [ %b2, %if.then2 ], [ 0, %entry ]
   %tobool4 = icmp ne i8 %obeys.0, 0
   ret i1 %tobool4
 }