diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -260,6 +260,16 @@ return MA->getValueID() == MemoryUseVal || MA->getValueID() == MemoryDefVal; } + void setDefiningAccess(MemoryAccess *DMA, bool Optimized = false, + Optional AR = MayAlias) { + if (!Optimized) { + setOperand(0, DMA); + return; + } + setOptimized(DMA); + setOptimizedAccessType(AR); + } + // Sadly, these have to be public because they are needed in some of the // iterators. inline bool isOptimized() const; @@ -296,16 +306,6 @@ OptimizedAccessAlias = AR; } - void setDefiningAccess(MemoryAccess *DMA, bool Optimized = false, - Optional AR = MayAlias) { - if (!Optimized) { - setOperand(0, DMA); - return; - } - setOptimized(DMA); - setOptimizedAccessType(AR); - } - private: Instruction *MemoryInstruction; Optional OptimizedAccessAlias; @@ -786,6 +786,7 @@ /// Used in various insertion functions to specify whether we are talking /// about the beginning or end of a block. enum InsertionPlace { Beginning, End }; + MemoryPhi *createMemoryPhi(BasicBlock *BB); protected: // Used by Memory SSA annotater, dumpers, and wrapper pass @@ -854,7 +855,6 @@ determineInsertionPoint(const SmallPtrSetImpl &DefiningBlocks); void markUnreachableAsLiveOnEntry(BasicBlock *BB); bool dominatesUse(const MemoryAccess *, const MemoryAccess *) const; - MemoryPhi *createMemoryPhi(BasicBlock *BB); template MemoryUseOrDef *createNewAccess(Instruction *, AliasAnalysisType *, const MemoryUseOrDef *Template = nullptr); diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -386,9 +386,13 @@ // This ends the loop pass pipelines. 
if (OptLevel > 1) { + if (EnableGVNHoist) + MPM.add(createGVNHoistPass()); MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds MPM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies + if (EnableGVNHoist) + MPM.add(createGVNHoistPass()); } MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset MPM.add(createSCCPPass()); // Constant prop with SCCP diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -31,6 +31,49 @@ // is disabled in the following cases. // 1. Scalars across calls. // 2. geps when corresponding load/store cannot be hoisted. +// +// TODO: +// For -Oz scalars are always safe to hoist => NO. +// For -O2/-O3 hoist only when the live range improves or remains the same. +// If we haven't computed dominator tree levels, do so now. +// For sink operation anything after hoist barrier is okay to sink +// but nothing before the hoist barrier. See Danny's patch on HoistBarriers. +// +// Find sink opportunities: sink instructions to a common post dominator when +// they compute the same value and hoisting isn't profitable because that will +// increase the liveness. In this case sinking may decrease the liveness when +// the instruction can be delayed. But do not delay so much that latency of the +// operation takes over. Like divide takes 10 cycles maybe, and if the divide is +// sunk into a BB where it is used then the result of divide may take long time +// anyways. So an estimated latency of operation should be computed and the +// instruction should be sunk only till the point when result may be readily +// available for the user of the instruction. +// +// Sinking store may not always be beneficial: sinking frees up a register so +// improves register allocation the stored result may be loaded again, in that +// case it is good to store soon (??) 
+// +// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores +// in this case because it works on two instructions at a time. +// entry: +// switch i32 %c1, label %exit1 [ +// i32 0, label %sw0 +// i32 1, label %sw1 +// ] +// +// sw0: +// store i32 1, i32* @G +// br label %exit +// +// sw1: +// store i32 1, i32* @G +// br label %exit +// +// exit1: +// store i32 1, i32* @G +// ret void +// exit: +// ret void //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" @@ -84,6 +127,7 @@ #define DEBUG_TYPE "gvn-hoist" STATISTIC(NumHoisted, "Number of instructions hoisted"); +STATISTIC(NumSunk, "Number of instructions sunk"); STATISTIC(NumRemoved, "Number of instructions removed"); STATISTIC(NumLoadsHoisted, "Number of loads hoisted"); STATISTIC(NumLoadsRemoved, "Number of loads removed"); @@ -91,6 +135,7 @@ STATISTIC(NumStoresRemoved, "Number of stores removed"); STATISTIC(NumCallsHoisted, "Number of calls hoisted"); STATISTIC(NumCallsRemoved, "Number of calls removed"); +// STATISTIC(NumBarriers, "Number of barriers"); static cl::opt MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1), @@ -111,12 +156,20 @@ MaxChainLength("gvn-hoist-max-chain-length", cl::Hidden, cl::init(10), cl::desc("Maximum length of dependent chains to hoist " "(default = 10, unlimited = -1)")); +static cl::opt + CheckHoistProfitability("gvn-hoist-check-profitability", cl::Hidden, cl::init(true), + cl::desc("Check for proitability (reducing register pressure)")); +static cl::opt + CheckSinkProfitability("gvn-sink-check-profitability", cl::Hidden, cl::init(true), + cl::desc("Check for proitability (reducing register pressure)")); namespace llvm { using BBSideEffectsSet = DenseMap; using SmallVecInsn = SmallVector; using SmallVecImplInsn = SmallVectorImpl; +using SmallVecVal = SmallVector; +using SmallVecImplVal = SmallVectorImpl; // Each element of a hoisting list contains the basic block where to hoist and // a 
list of instructions to be hoisted. @@ -129,6 +182,14 @@ using VNtoInsns = DenseMap>; +typedef SmallSet SmallSetBB; +typedef DenseMap MergeSetT; +typedef SmallVector SmallVecBB; +typedef SmallVecBB BBLevelKeyT; +typedef std::map BBLevelT; +typedef DenseMap DomLevelsT; +typedef std::pair EdgeT; + // CHI keeps information about values flowing out of a basic block. It is // similar to PHI but in the inverse graph, and used for outgoing values on each // edge. For conciseness, it is computed only for instructions with multiple @@ -157,6 +218,7 @@ using OutValuesType = DenseMap>; using InValuesType = DenseMap, 2>>; +typedef DenseMap> RenameStackType; // An invalid value number Used when inserting a single value number into // VNtoInsns. @@ -174,7 +236,11 @@ VNtoScalars[{V, InvalidVN}].push_back(I); } + void clear() { VNtoScalars.clear(); } + + VNtoInsns &getVNTable() { return VNtoScalars; } const VNtoInsns &getVNTable() const { return VNtoScalars; } + void removeVN(unsigned V) { VNtoScalars.erase({V, InvalidVN}); } }; // Records all load instructions candidate for code hoisting. @@ -190,7 +256,10 @@ } } + void clear() { VNtoLoads.clear(); } + VNtoInsns &getVNTable() { return VNtoLoads; } const VNtoInsns &getVNTable() const { return VNtoLoads; } + void removeVN(unsigned V) { VNtoLoads.erase({V, InvalidVN}); } }; // Records all store instructions candidate for code hoisting. @@ -209,7 +278,11 @@ VNtoStores[{VN.lookupOrAdd(Ptr), VN.lookupOrAdd(Val)}].push_back(Store); } + void clear() { VNtoStores.clear(); } + + VNtoInsns &getVNTable() { return VNtoStores; } const VNtoInsns &getVNTable() const { return VNtoStores; } + void removeVN(unsigned V) { VNtoStores.erase({V, InvalidVN}); } }; // Records all call instructions candidate for code hoisting. 
@@ -235,9 +308,24 @@ VNtoCallsStores[Entry].push_back(Call); } + void clear() { + VNtoCallsScalars.clear(); + VNtoCallsLoads.clear(); + VNtoCallsStores.clear(); + } + + VNtoInsns &getScalarVNTable() { return VNtoCallsScalars; } + VNtoInsns &getLoadVNTable() { return VNtoCallsLoads; } + VNtoInsns &getStoreVNTable() { return VNtoCallsStores; } const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; } const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; } const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; } + + void removeVN(unsigned V) { + if(!VNtoCallsScalars.erase({V, InvalidVN})) + if(!VNtoCallsLoads.erase({V, InvalidVN})) + VNtoCallsStores.erase({V, InvalidVN}); + } }; static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) { @@ -249,18 +337,164 @@ combineMetadata(ReplInst, I, KnownIDs, true); } +void printBBLevels(const BBLevelT &BBLevels) { + for (const std::pair &P : BBLevels) { + dbgs() << "\nLevel: " << P.first << "\n"; + for (const BasicBlock *BB : P.second) + dbgs() << *BB << "\n"; + } +} + +void printMergeSet(const MergeSetT &M) { + // For printing in a deterministic order. + typedef std::set SetConstBB; + auto cmp = [](const BasicBlock *A, const BasicBlock *B) { + return A->getName() < B->getName(); + }; + std::map PrintM(cmp); + for (const std::pair &P : M) { + for (const BasicBlock *BB : P.second) + PrintM[P.first].insert(BB); + } + for (const std::pair &P : PrintM) { + dbgs() << "\nMergeSet of: " << P.first->getName() << ": "; + for (const BasicBlock *BB : P.second) + dbgs() << BB->getName() << ", "; + } +} + +void printJEdges(const DenseSet &Edges) { + auto cmp = [](const EdgeT &A, const EdgeT &B) { + return A.first->getName() < B.first->getName(); + }; + // For printing in a deterministic order. 
+ std::set PrintE(Edges.begin(), Edges.end(), cmp); + + for (const EdgeT &E : PrintE) + dbgs() << "\nFound a JEdge: " << E.first->getName() << " -> " + << E.second->getName(); +} + +void printSmallSet(SmallSetBB &S) { + dbgs() << "\nPrinting SmallSet: "; + for (const auto &BB : S) + dbgs() << BB->getName() << ","; +} + +static inline void updateLocalStats(const Instruction *Repl, unsigned &NI, + unsigned &NL, unsigned &NS, unsigned &NC) { + if (isa(Repl)) + ++NL; + else if (isa(Repl)) + ++NS; + else if (isa(Repl)) + ++NC; + else // Scalar + ++NI; +} + +static inline void updateGlobalStats(unsigned NI, unsigned NL, unsigned NS, + unsigned NC, unsigned NR) { + NumHoisted += NL + NS + NC + NI; + NumRemoved += NR; + NumLoadsHoisted += NL; + NumStoresHoisted += NS; + NumCallsHoisted += NC; +} + // This pass hoists common computations across branches sharing common // dominator. The primary goal is to reduce the code size, and in some // cases reduce critical path (by exposing more ILP). class GVNHoist { public: + // Each element of a hoisting list contains the basic block where to hoist and + // a list of instructions to be hoisted. + typedef std::pair HoistingPointInfo; + typedef SmallVector HoistingPointList; + GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA, MemoryDependenceResults *MD, MemorySSA *MSSA) : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA), - MSSAUpdater(std::make_unique(MSSA)) {} + MSSAUpdater(std::make_unique(MSSA)), + HoistingGeps(false), HoistedCtr(0) { + clearVNTables(); + } + + void clearVNTables() { + II.clear(); + LI.clear(); + SI.clear(); + CI.clear(); + } + + // DomLevels maps from BB -> its depth from root. + // JEdges only contain the J edges as D edges are available in Dominator Tree. + // BBLevels maps each depth in the CFG to all the Basic Blocks at that level. + // DJ Graph is described in "Sreedhar, Vugranam C. Efficient program analysis + // using DJ graphs. McGill University, 1996". 
+ void constructDJGraph(DomLevelsT &DomLevels, DenseSet &JEdges, + BBLevelT &BBLevels); + + // Return true if S1 is a subset of S2. + bool isSubset(const SmallSetBB &S1, const SmallSetBB &S2) { + if (S1.size() > S2.size()) + return false; + for (BasicBlock *BB : S1) { + if (!S2.count(BB)) + return false; + } + return true; + } + + // Returns true when A executes before B. + bool DFSInOrder(const Instruction *A, const Instruction *B) const { + const BasicBlock *BA = A->getParent(); + const BasicBlock *BB = B->getParent(); + unsigned ADFS, BDFS; + if (BA == BB) { + ADFS = DFSNumber.lookup(A); + BDFS = DFSNumber.lookup(B); + } else { + ADFS = DFSNumber.lookup(BA); + BDFS = DFSNumber.lookup(BB); + } + assert(ADFS && BDFS); + return ADFS < BDFS; + } + + // DomLevels maps BB to its depth from root. + // JEdges only contain the J-edges as D-edges are available in Dominator Tree. + // BBLevels maps each depth in the CFG to all the BBs at that level. + // BILARDI, G. AND PINGALI, K. 2003. Algorithms for computing the static + // single assignment form. J. ACM 50, 3 (May), 375–425. + bool constructMergeSet(DomLevelsT &DomLevels, DenseSet &JEdges, + BBLevelT &BBLevels); + + // Returns true if the \p Val is the last use at \p I. + // TODO: Find O(1) algorithm for this. + const Instruction *lastUser(const Instruction *I, const Value *Val) const; + + // Returns true if the \p Val is live-out from \p BB. 
+ bool isLiveOutUsingMergeSet(BasicBlock *BB, Value *Val) const; bool run(Function &F) { + if (F.hasMinSize()) { + CheckHoistProfitability = false; + CheckSinkProfitability = false; + } NumFuncArgs = F.arg_size(); + //DT->recalculate(F); + DT->updateDFSNumbers(); + DomLevelsT DomLevels; + DenseSet JEdges; + BBLevelT BBLevels; + constructDJGraph(DomLevels, JEdges, BBLevels); + // printBBLevels(BBLevels); + LLVM_DEBUG(printJEdges(JEdges)); + while (constructMergeSet(DomLevels, JEdges, BBLevels)) + ; + LLVM_DEBUG(printMergeSet(MergeSet)); + VN.setDomTree(DT); VN.setAliasAnalysis(AA); VN.setMemDep(MD); @@ -281,9 +515,10 @@ if (MaxChainLength != -1 && ++ChainLength >= MaxChainLength) return Res; + clearVNTables(); auto HoistStat = hoistExpressions(F); if (HoistStat.first + HoistStat.second == 0) - return Res; + break; if (HoistStat.second > 0) // To address a limitation of the current GVN, we need to rerun the @@ -294,6 +529,8 @@ Res = true; } + clearVNTables(); + sinkExpressions(F); return Res; } @@ -335,15 +572,179 @@ std::unique_ptr MSSAUpdater; DenseMap DFSNumber; BBSideEffectsSet BBSideEffects; - DenseSet HoistBarrier; SmallVector IDFBlocks; unsigned NumFuncArgs; const bool HoistingGeps = false; + MergeSetT MergeSet; + DenseSet HoistBarrier; + int HoistedCtr; + InsnInfo II; + LoadInfo LI; + StoreInfo SI; + CallInfo CI; enum InsKind { Unknown, Scalar, Load, Store }; + bool hoistCandidate(const User *U) const { + if (!VN.exists(const_cast(U))) // Only for scalars + return false; + unsigned V = VN.lookup(const_cast(U)); + + VNType L({V, InvalidVN}); + + // Multiple scalars with same VN have very high chance of being hoisted. + if (II.getVNTable().count(L) > 1) + return true; + + // Multiple loads with same VN have very high chance of being hoisted. + if (LI.getVNTable().count(L) > 1) + return true; + + // Multiple stores with same VN have very high chance of being hoisted. 
+ if (SI.getVNTable().count(L) > 1) + return true; + + // Multiple calls with same VN have very high chance of being hoisted. + if (CI.getLoadVNTable().count(L) > 1) + return true; + + // Multiple calls with same VN have very high chance of being hoisted. + if (CI.getScalarVNTable().count(L) > 1) + return true; + + // Multiple calls with same VN have very high chance of being hoisted. + if (CI.getStoreVNTable().count(L) > 1) + return true; + return false; + } + + bool successorDominate(const BasicBlock *BB, const BasicBlock *A); + // Return true when there are exception handling in BB. - bool hasEH(const BasicBlock *BB) { + bool hasEH(const BasicBlock *BB); + + bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB, + int &NBBsOnAllPaths); + + // Return true when there are exception handling or loads of memory Def + // between Def and NewPt. This function is only called for stores: Def is + // the MemoryDef of the store to be hoisted. + // + // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and + // return true when the counter NBBsOnAllPaths reaces 0, except when it is + // initialized to -1 which is unlimited. + bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def, + int &NBBsOnAllPaths); + + // Return true when there are exception handling between HoistPt and BB. + // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and + // return true when the counter NBBsOnAllPaths reaches 0, except when it is + // initialized to -1 which is unlimited. + bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB, + int &NBBsOnAllPaths); + + // Return true when there are memory uses of Def in BB. + bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def, + const BasicBlock *BB); + + bool firstInBB(const Instruction *I1, const Instruction *I2) const; + + // Return true when it is safe to hoist scalar instructions from all blocks in + // WL to HoistBB. 
TODO: Hoisting scalars past a call only when the + // interference does not increase. + bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB, + int &NBBsOnAllPaths); + // Return true when it is safe to hoist a memory load or store U from OldPt + // to NewPt. + bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt, + MemoryUseOrDef *MA, InsKind K, int &NBBsOnAllPaths); + bool safeToHoistLdSt_0(const Instruction *NewHoistPt, + const Instruction *HoistPt, const Instruction *Insn, + MemoryUseOrDef *MA, GVNHoist::InsKind K, + int &NumBBsOnAllPaths, const BasicBlock *HoistPtBB, + const BasicBlock *NewHoistBB, const BasicBlock *InsnBB, + SmallPtrSetImpl &WL); + // Return true when it is profitable to hoist I1. + bool profitableToHoist(Instruction *I) const; + bool profitableToHoist(CHIArgs C) const; + // Return true when it is profitable to hoist I1. + bool profitableToSink(Instruction *I) const; + + bool safeToSinkToEnd(Instruction *I, BasicBlock *BB, GVNHoist::InsKind K); + + // Find sinkable instructions. + void findSinkableInsn(VNtoInsns &Map, HoistingPointList &HPL, + InsKind K); + + bool valueAnticipable(CHIArgs C, Instruction *TI) const; + void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K, + SmallVectorImpl &Safe); + + // Push all the VNs corresponding to BB into RenameStack. + void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs, + RenameStackType &RenameStack); + + void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs, + RenameStackType &RenameStack); + + void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs); + + void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K, + HoistingPointList &HPL); + + void computeInsertionPoints(VNtoInsns &Map, HoistingPointList &HPL, + InsKind K); + + // Return true when all operands of Instr are available at insertion point + // HoistPt. When limiting the number of hoisted expressions, one could hoist + // a load without hoisting its access function. 
So before hoisting any + // expression, make sure that all its operands are available at insert point. + bool allOperandsAvailable(Instruction *I, + BasicBlock *NewDest, + SmallVecImplVal &Unav) const; + // Same as allOperandsAvailable with recursive check for GEP operands. + bool allGepOperandsAvailable(const Instruction *I, + const BasicBlock *HoistPt) const; + + // Make all operands of the GEP available. + void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt, + const SmallVecInsn &InstructionsToHoist, + Instruction *Gep) const; + + // In the case Repl is a load or a store, we make all their GEPs + // available: GEPs are not hoisted by default to avoid the address + // computations to be hoisted without the associated load or store. + bool makeGepOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt, + const SmallVecInsn &InstructionsToHoist) const; + + void updateAlignment(Instruction *I, Instruction *Repl); + + void removeAndReplace(const SmallVecInsn &InstructionsToHoist, + Instruction *Repl); + void removeAndReplace(Instruction *I, Instruction *Repl, + MemoryAccess *NewMemAcc); + + // Remove and rename all instructions other than Repl. + unsigned removeAndReplace(const SmallVecInsn &Candidates, Instruction *Repl, + BasicBlock *DestBB, bool MoveAccess); + + void removeMPhi(MemoryAccess *NewMemAcc); + unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl, + MemoryUseOrDef *NewMemAcc); + void raMPHIuw(MemoryUseOrDef *NewMemAcc); + + std::pair hoist(HoistingPointList &HPL); + std::pair sink(HoistingPointList &HPL); + bool getVN(Instruction *I); + void barriersAndVNs(Function &F, bool Sinking); + // Hoist all expressions. Returns Number of scalars hoisted + // and number of non-scalars hoisted. + std::pair hoistExpressions(Function &F); + std::pair sinkExpressions(Function &F); +}; + +// Return true when there are exception handling in BB. 
+bool GVNHoist::hasEH(const BasicBlock *BB) { auto It = BBSideEffects.find(BB); if (It != BBSideEffects.end()) return It->second; @@ -352,36 +753,34 @@ BBSideEffects[BB] = true; return true; } - if (BB->getTerminator()->mayThrow()) { BBSideEffects[BB] = true; return true; } - BBSideEffects[BB] = false; return false; - } +} - // Return true when a successor of BB dominates A. - bool successorDominate(const BasicBlock *BB, const BasicBlock *A) { - for (const BasicBlock *Succ : successors(BB)) +// Return true when a successor of BB dominates A. +bool GVNHoist::successorDominate(const BasicBlock *BB, const BasicBlock *A) { + for (const BasicBlock *Succ : successors(BB)) if (DT->dominates(Succ, A)) return true; return false; - } +} - // Return true when I1 appears before I2 in the instructions of BB. - bool firstInBB(const Instruction *I1, const Instruction *I2) { +// Return true when I1 appears before I2 in the instructions of BB. +bool GVNHoist::firstInBB(const Instruction *I1, const Instruction *I2) const { assert(I1->getParent() == I2->getParent()); unsigned I1DFS = DFSNumber.lookup(I1); unsigned I2DFS = DFSNumber.lookup(I2); assert(I1DFS && I2DFS); return I1DFS < I2DFS; - } +} - // Return true when there are memory uses of Def in BB. - bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def, +// Return true when there are memory uses of Def in BB. +bool GVNHoist::hasMemoryUse(const Instruction *NewPt, MemoryDef *Def, const BasicBlock *BB) { const MemorySSA::AccessList *Acc = MSSA->getBlockAccesses(BB); if (!Acc) @@ -413,9 +812,9 @@ } return false; - } +} - bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB, +bool GVNHoist::hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB, int &NBBsOnAllPaths) { // Stop walk once the limit is reached. if (NBBsOnAllPaths == 0) @@ -432,16 +831,16 @@ return true; return false; - } +} - // Return true when there are exception handling or loads of memory Def - // between Def and NewPt. 
This function is only called for stores: Def is - // the MemoryDef of the store to be hoisted. +// Return true when there are exception handling or loads of memory Def +// between Def and NewPt. This function is only called for stores: Def is +// the MemoryDef of the store to be hoisted. - // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and - // return true when the counter NBBsOnAllPaths reaces 0, except when it is - // initialized to -1 which is unlimited. - bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def, +// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and +// return true when the counter NBBsOnAllPaths reaces 0, except when it is +// initialized to -1 which is unlimited. +bool GVNHoist::hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def, int &NBBsOnAllPaths) { const BasicBlock *NewBB = NewPt->getParent(); const BasicBlock *OldBB = Def->getBlock(); @@ -476,13 +875,13 @@ } return false; - } +} - // Return true when there are exception handling between HoistPt and BB. - // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and - // return true when the counter NBBsOnAllPaths reaches 0, except when it is - // initialized to -1 which is unlimited. - bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB, +// Return true when there are exception handling between HoistPt and BB. +// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and +// return true when the counter NBBsOnAllPaths reaches 0, except when it is +// initialized to -1 which is unlimited. +bool GVNHoist::hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB, int &NBBsOnAllPaths) { assert(DT->dominates(HoistPt, SrcBB) && "Invalid path"); @@ -510,12 +909,14 @@ } return false; - } +} + +// Return true when it is safe to hoist a memory load or store U from OldPt +// to NewPt. 
+bool GVNHoist::safeToHoistLdSt(const Instruction *NewPt, + const Instruction *OldPt, MemoryUseOrDef *U, + InsKind K, int &NBBsOnAllPaths) { - // Return true when it is safe to hoist a memory load or store U from OldPt - // to NewPt. - bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt, - MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) { // In place hoisting is safe. if (NewPt == OldPt) return true; @@ -553,68 +954,275 @@ // No side effects: it is safe to hoist. return true; - } +} - // Return true when it is safe to hoist scalar instructions from all blocks in - // WL to HoistBB. - bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB, - int &NBBsOnAllPaths) { +// Return true when it is safe to hoist scalar instructions from all blocks in +// WL to HoistBB. +bool GVNHoist::safeToHoistScalar(const BasicBlock *HoistBB, + const BasicBlock *BB, int &NBBsOnAllPaths) { return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths); +} + +bool GVNHoist::safeToHoistLdSt_0(const Instruction *NewHoistPt, + const Instruction *HoistPt, + const Instruction *Insn, MemoryUseOrDef *MA, + GVNHoist::InsKind K, int &NumBBsOnAllPaths, + const BasicBlock *HoistPtBB, + const BasicBlock *NewHoistBB, + const BasicBlock *InsnBB, + SmallPtrSetImpl &WL) { + return (HoistPtBB == NewHoistBB || InsnBB == NewHoistBB + /*hoistingFromAllPaths(NewHoistBB, WL)*/) && + // Also check that it is safe to move the load or store from HoistPt + // to NewHoistPt, and from Insn to NewHoistPt. Note that HoistPt may + // not be the instruction to be hoisted, it is a transient placeholder + // to find the farthest hoisting point when >2 hoistable candidates + // can be hoisted to a common dominator. 
+ safeToHoistLdSt(NewHoistPt, HoistPt, MA, K, NumBBsOnAllPaths) && + safeToHoistLdSt(NewHoistPt, Insn, MSSA->getMemoryAccess(Insn), K, + NumBBsOnAllPaths); +} + +bool GVNHoist::profitableToHoist(Instruction *I) const { + // For -O3/-O2 hoist only when the liveness decreases i.e., no more than + // one operand can be a use without kill. + // Store and Calls do not create a register def. + if (isa(I) || isa(I)) + return true; + + // If Op is a kill then it will not be live-out from its basic block + // but the reverse is not true. + for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) { + Value *Op = I->getOperand(op); + //if (isa(Op)) + // continue; + if (isLiveOutUsingMergeSet(I->getParent(), Op)) + return false; + // It is always profitable to hoist when the liveness does not increase, + // a Kill will compensate for the def created by this instruction. + const Instruction *LU = lastUser(I, Op); + if (LU == I) + return true; + else { + // We optimistically assume that if all the users of Op are hoistable + // candidates then it is profitable to hoist. + bool stillProfitable = true; + for (const User *U : Op->users()) { + if (!hoistCandidate(U)) { + stillProfitable = false; + break; + } + LLVM_DEBUG(dbgs() << "\nstill hoistable:" << *U); + } + if (stillProfitable) + return true; + } } + return false; +} - // In the inverse CFG, the dominance frontier of basic block (BB) is the - // point where ANTIC needs to be computed for instructions which are going - // to be hoisted. Since this point does not change during gvn-hoist, - // we compute it only once (on demand). 
- // The ides is inspired from: - // "Partial Redundancy Elimination in SSA Form" - // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW - // They use similar idea in the forward graph to find fully redundant and - // partially redundant expressions, here it is used in the inverse graph to - // find fully anticipable instructions at merge point (post-dominator in - // the inverse CFG). - // Returns the edge via which an instruction in BB will get the values from. - - // Returns true when the values are flowing out to each edge. - bool valueAnticipable(CHIArgs C, Instruction *TI) const { - if (TI->getNumSuccessors() > (unsigned)size(C)) - return false; // Not enough args in this CHI. +bool GVNHoist::profitableToHoist(CHIArgs C) const { + if (!CheckHoistProfitability) + return true; + int count = 0; + for (const auto &A : C) { + if (!profitableToHoist(A.I)) + ++count; + } + return 2*count <= size(C); +} + +bool GVNHoist::profitableToSink(Instruction *I) const { + if (!CheckSinkProfitability) + return true; + // For -O3/-O2 sink only when the liveness decreases i.e., no more than + // one operand can be a kill. + // Store and Calls do not create a register def. + if (isa(I) || isa(I)) + return false; - for (auto CHI : C) { - BasicBlock *Dest = CHI.Dest; - // Find if all the edges have values flowing out of BB. - bool Found = llvm::any_of( - successors(TI), [Dest](const BasicBlock *BB) { return BB == Dest; }); - if (!Found) + // If Op is a kill then it will not be live-out from its basic block + // but the reverse is not true. + unsigned NumKills = 0; + for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) { + Value *Op = I->getOperand(op); + // It is always profitable to sink when the liveness does not increase. + if (isLiveOutUsingMergeSet(I->getParent(), Op)) + continue; + if (!isa(Op)) + ++NumKills; + // A Kill will increase the liveness during a sink. 
+ const Instruction *LU = lastUser(I, Op); + if (LU == I && NumKills > 1) + return false; + } + return true; +} + +// Returns true when the instruction does not result +// in a hazard with any other instruction till the end of BB. +bool GVNHoist::safeToSinkToEnd(Instruction *I, BasicBlock *BB, + GVNHoist::InsKind K) { + BasicBlock::iterator II(I); + auto IMA = MSSA->getMemoryAccess(I); + assert(IMA); + if (MemoryUseOrDef *I0UD = cast(IMA)) { + // Only sink loads for now because sinking load either decreases or + // preserves live-range. + if (!isa(I0UD)) + return false; + + // MemoryAccess *Def = I0UD->getDefiningAccess(); + /*/ Updating Memory PHI is tricky, bail out for now. + for (User *U : Def->users()) + if (isa(U)) + return false;*/ + } + while (++II != BB->end()) { // Skip I + auto IIMA = MSSA->getMemoryAccess(&*II); + // TODO: Bails out on any memory writes for now, improve for non-aliases. + if (IIMA) { + if (!isa(IIMA)) return false; } - return true; } + return true; +} - // Check if it is safe to hoist values tracked by CHI in the range - // [Begin, End) and accumulate them in Safe. - void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K, - SmallVectorImpl &Safe) { - int NumBBsOnAllPaths = MaxNumberOfBBSInPath; - for (auto CHI : C) { - Instruction *Insn = CHI.I; - if (!Insn) // No instruction was inserted in this CHI. +void GVNHoist::findSinkableInsn(VNtoInsns &Map, + GVNHoist::HoistingPointList &HPL, + GVNHoist::InsKind K) { + // Sort the VNs by rank, higher ranked should be sunk first. 
+ // SmallVector SortedRank; + // sortByRank(SortedRank, Map); + SmallVector ToErase; + for (const auto &Entry : Map) { + const SmallVecInsn &V = Entry.second; + if (V.size() < 2) { + ToErase.push_back(Entry.first); + continue; + } + + // Only sink to common post-dom + SmallVecInsn InstructionsToSink; + for (Instruction *I : V) { + auto BB = I->getParent(); + // All the sinkable instructions are collected after any barrier in + // a basic block so no need to check for barrier BBs. + + // The instruction should have only one user i.e., PHI in the Succ + // TODO: Handle store, calls etc. without users + unsigned NumUsers = std::distance(I->user_begin(), I->user_end()); + if (NumUsers != 1) continue; - if (K == InsKind::Scalar) { - if (safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths)) - Safe.push_back(CHI); - } else { - MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn); - if (safeToHoistLdSt(BB->getTerminator(), Insn, UD, K, NumBBsOnAllPaths)) - Safe.push_back(CHI); + // The only user should be a PHI. + if (!isa(I->user_back())) + continue; + + // Sink to single successor. + if (auto Succ = BB->getSingleSuccessor()) { + if (I->user_back()->getParent() != Succ) + continue; // PHI should be in the immediate successor. + // The successor should have exactly two predecessors. + unsigned NumPreds = std::distance(pred_begin(Succ), pred_end(Succ)); + if (NumPreds != 2) + continue; + + // Sinkable instruction found, check for safety and profitability. + // As the instructions are from a basic block without barriers, we can + // sink this instruction as long as there is no hazard. Scalars are + // safe to sink. + if (K != InsKind::Scalar && !safeToSinkToEnd(I, BB, K)) + continue; + if (!profitableToSink(I)) + continue; + + // We don't need to check for barriers here because the instruction + // will be sunk at the beginning of a basic block i.e., before any + // barrier, and I is after any barriers in I->getParent(). 
+ if (!hasEH(Succ)) + InstructionsToSink.push_back(I); } } + + // assert(InstructionsToSink.size() < 3 && "Test case"); + // Sort V such that adjacent Instructions share common PDom. + if (InstructionsToSink.size() != 2) + continue; + // assert(InstructionsToSink.size() != 2 && "Test case"); + // There should be a unique PHI using I0 and I1 to be legally sinkable. + if (InstructionsToSink[0]->user_back() == + InstructionsToSink[1]->user_back()) { + auto Succ0 = InstructionsToSink[0]->getParent()->getSingleSuccessor(); + auto Succ1 = InstructionsToSink[1]->getParent()->getSingleSuccessor(); + assert(Succ0 == Succ1); + HPL.push_back({Succ0, InstructionsToSink}); + } + ToErase.push_back(Entry.first); } - using RenameStackType = DenseMap>; + // Erase VNs already handled. + for (auto V : ToErase) + Map.erase(V); +} - // Push all the VNs corresponding to BB into RenameStack. - void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs, +// In the inverse CFG, the dominance frontier of basic block (BB) is the +// point where ANTIC needs to be computed for instructions which are going +// to be hoisted. Since this point does not change during gvn-hoist, +// we compute it only once (on demand). +// The ides is inspired from: +// "Partial Redundancy Elimination in SSA Form" +// ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW +// They use similar idea in the forward graph to to find fully redundant and +// partially redundant expressions, here it is used in the inverse graph to +// find fully anticipable instructions at merge point (post-dominator in +// the inverse CFG). +// Returns the edge via which an instruction in BB will get the values from. + +// Returns true when the values are flowing out to each edge. +bool GVNHoist::valueAnticipable(CHIArgs C, Instruction *TI) const { + if (TI->getNumSuccessors() > (unsigned)size(C)) + return false; // Not enough args in this CHI. 
+ + for (const CHIArg &CHI : C) { + const BasicBlock *Dest = CHI.Dest; + // Find if all the edges have values flowing out of BB. + // This is clearly quadratic here. May need more efficient algorithm. + // Because it happens for limited CHIs it is 'okay' for now. + // TODO: To make it more efficient, collect all successors(TI) and all + // CHI.Dest in separate containers in sorted manner, and find a mismatch + // to get the same result as the following. + bool Found = any_of(successors(TI), + [Dest](const BasicBlock *BB) { return BB == Dest; }); + if (!Found) + return false; + } + return true; +} + +// Check if it is safe to hoist values tracked by CHI in the range +// [Begin, End) and accumulate them in Safe. +void GVNHoist::checkSafety(CHIArgs C, BasicBlock *BB, InsKind K, + SmallVectorImpl &Safe) { + int NumBBsOnAllPaths = MaxNumberOfBBSInPath; + for (auto CHI : C) { + Instruction *Insn = CHI.I; + if (!Insn) // No instruction was inserted in this CHI. + continue; + if (K == InsKind::Scalar) { + if (safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths)) + Safe.push_back(CHI); + } else { + MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn); + // Use safeToHoistLdSt_0 + if (safeToHoistLdSt(BB->getTerminator(), Insn, UD, K, NumBBsOnAllPaths)) + Safe.push_back(CHI); + } + } +} + +// Push all the VNs corresponding to BB into RenameStack. 
+void GVNHoist::fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs, RenameStackType &RenameStack) { auto it1 = ValueBBs.find(BB); if (it1 != ValueBBs.end()) { @@ -625,9 +1233,9 @@ RenameStack[VI.first].push_back(VI.second); } } - } +} - void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs, +void GVNHoist::fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs, RenameStackType &RenameStack) { // For each *predecessor* (because Post-DOM) of BB check if it has a CHI for (auto Pred : predecessors(BB)) { @@ -661,13 +1269,13 @@ ++It; } } - } +} - // Walk the post-dominator tree top-down and use a stack for each value to - // store the last value you see. When you hit a CHI from a given edge, the - // value to use as the argument is at the top of the stack, add the value to - // CHI and pop. - void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) { +// Walk the post-dominator tree top-down and use a stack for each value to +// store the last value you see. When you hit a CHI from a given edge, the +// value to use as the argument is at the top of the stack, add the value to +// CHI and pop. +void GVNHoist::insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) { auto Root = PDT->getNode(nullptr); if (!Root) return; @@ -684,148 +1292,153 @@ // Fill outgoing values in each CHI corresponding to BB. fillChiArgs(BB, CHIBBs, RenameStack); } - } - - // Walk all the CHI-nodes to find ones which have a empty-entry and remove - // them Then collect all the instructions which are safe to hoist and see if - // they form a list of anticipable values. OutValues contains CHIs - // corresponding to each basic block. - void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K, - HoistingPointList &HPL) { - auto cmpVN = [](const CHIArg &A, const CHIArg &B) { return A.VN < B.VN; }; - - // CHIArgs now have the outgoing values, so check for anticipability and - // accumulate hoistable candidates in HPL. 
- for (std::pair> &A : CHIBBs) { - BasicBlock *BB = A.first; - SmallVectorImpl &CHIs = A.second; - // Vector of PHIs contains PHIs for different instructions. - // Sort the args according to their VNs, such that identical - // instructions are together. - llvm::stable_sort(CHIs, cmpVN); - auto TI = BB->getTerminator(); - auto B = CHIs.begin(); - // [PreIt, PHIIt) form a range of CHIs which have identical VNs. - auto PHIIt = std::find_if(CHIs.begin(), CHIs.end(), - [B](CHIArg &A) { return A != *B; }); - auto PrevIt = CHIs.begin(); - while (PrevIt != PHIIt) { - // Collect values which satisfy safety checks. - SmallVector Safe; - // We check for safety first because there might be multiple values in - // the same path, some of which are not safe to be hoisted, but overall - // each edge has at least one value which can be hoisted, making the - // value anticipable along that path. - checkSafety(make_range(PrevIt, PHIIt), BB, K, Safe); - - // List of safe values should be anticipable at TI. - if (valueAnticipable(make_range(Safe.begin(), Safe.end()), TI)) { - HPL.push_back({BB, SmallVecInsn()}); - SmallVecInsn &V = HPL.back().second; - for (auto B : Safe) - V.push_back(B.I); - } +} - // Check other VNs - PrevIt = PHIIt; - PHIIt = std::find_if(PrevIt, CHIs.end(), - [PrevIt](CHIArg &A) { return A != *PrevIt; }); +// Walk all the CHI-nodes to find ones which have a empty-entry and remove +// them Then collect all the instructions which are safe to hoist and see if +// they form a list of anticipable values. OutValues contains CHIs +// corresponding to each basic block. +void GVNHoist::findHoistableCandidates(OutValuesType &CHIBBs, InsKind K, + HoistingPointList &HPL) { + auto cmpVN = [](const CHIArg &A, const CHIArg &B) { return A.VN < B.VN; }; + + // CHIArgs now have the outgoing values, so check for anticipability and + // accumulate hoistable candidates in HPL. 
+ for (std::pair> &A : CHIBBs) { + BasicBlock *BB = A.first; + SmallVectorImpl &CHIs = A.second; + // Vector of CHIs contains CHIs for different instructions. + // Sort the args according to their VNs, such that identical + // instructions are together. + llvm::stable_sort(CHIs, cmpVN); + auto TI = BB->getTerminator(); + auto B = CHIs.begin(); + // [PreIt, PHIIt) form a range of CHIs which have identical VNs. + auto PHIIt = std::find_if(CHIs.begin(), CHIs.end(), + [B](CHIArg &A) { return A != *B; }); + auto PrevIt = CHIs.begin(); + while (PrevIt != PHIIt) { + // Collect values which satisfy safety checks. + SmallVector Safe; + // We check for safety first because there might be multiple values in + // the same path, some of which are not safe to be hoisted, but overall + // each edge has at least one value which can be hoisted, making the + // value anticipable along that path. + checkSafety(make_range(PrevIt, PHIIt), BB, K, Safe); + + // List of safe values should be anticipable at TI. + auto SR = make_range(Safe.begin(), Safe.end()); + if (valueAnticipable(SR, TI) && profitableToHoist(SR)) { + HPL.push_back({BB, SmallVecInsn()}); + SmallVecInsn &V = HPL.back().second; + for (auto B : Safe) + V.push_back(B.I); } + + // Check other VNs + PrevIt = PHIIt; + PHIIt = std::find_if(PrevIt, CHIs.end(), + [PrevIt](CHIArg &A) { return A != *PrevIt; }); } } +} - // Compute insertion points for each values which can be fully anticipated at - // a dominator. HPL contains all such values. - void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL, - InsKind K) { - // Sort VNs based on their rankings - std::vector Ranks; - for (const auto &Entry : Map) { - Ranks.push_back(Entry.first); - } +// Compute insertion points for each values which can be fully anticipated at +// a dominator. HPL contains all such values. 
+void GVNHoist::computeInsertionPoints(VNtoInsns &Map, + HoistingPointList &HPL, InsKind K) { + // Sort VNs based on their rankings + std::vector Ranks; + for (const auto &Entry : Map) { + Ranks.push_back(Entry.first); + } - // TODO: Remove fully-redundant expressions. - // Get instruction from the Map, assume that all the Instructions - // with same VNs have same rank (this is an approximation). - llvm::sort(Ranks, [this, &Map](const VNType &r1, const VNType &r2) { - return (rank(*Map.lookup(r1).begin()) < rank(*Map.lookup(r2).begin())); - }); - - // - Sort VNs according to their rank, and start with lowest ranked VN - // - Take a VN and for each instruction with same VN - // - Find the dominance frontier in the inverse graph (PDF) - // - Insert the chi-node at PDF - // - Remove the chi-nodes with missing entries - // - Remove values from CHI-nodes which do not truly flow out, e.g., - // modified along the path. - // - Collect the remaining values that are still anticipable - SmallVector IDFBlocks; - ReverseIDFCalculator IDFs(*PDT); - OutValuesType OutValue; - InValuesType InValue; - for (const auto &R : Ranks) { - const SmallVecInsn &V = Map.lookup(R); - if (V.size() < 2) - continue; - const VNType &VN = R; - SmallPtrSet VNBlocks; - for (auto &I : V) { - BasicBlock *BBI = I->getParent(); - if (!hasEH(BBI)) - VNBlocks.insert(BBI); - } - // Compute the Post Dominance Frontiers of each basic block - // The dominance frontier of a live block X in the reverse - // control graph is the set of blocks upon which X is control - // dependent. The following sequence computes the set of blocks - // which currently have dead terminators that are control - // dependence sources of a block which is in NewLiveBlocks. - IDFs.setDefiningBlocks(VNBlocks); - IDFBlocks.clear(); - IDFs.calculate(IDFBlocks); - - // Make a map of BB vs instructions to be hoisted. + // TODO: Remove fully-redundant expressions. 
+ // Get instruction from the Map, assume that all the Instructions + // with same VNs have same rank (this is an approximation). + llvm::sort(Ranks, [this, &Map](const VNType &r1, const VNType &r2) { + return (rank(*Map.lookup(r1).begin()) < rank(*Map.lookup(r2).begin())); + }); + + // - Sort VNs according to their rank, and start with lowest ranked VN + // - Take a VN and for each instruction with same VN + // - Find the dominance frontier in the inverse graph (PDF) + // - Insert the chi-node at PDF + // - Remove the chi-nodes with missing entries + // - Remove values from CHI-nodes which do not truly flow out, e.g., + // modified along the path. + // - Collect the remaining values that are still anticipable + SmallVector IDFBlocks; + ReverseIDFCalculator IDFs(*PDT); + OutValuesType OutValue; + InValuesType InValue; + SmallVector ToErase; + for (const auto &R : Ranks) { + const SmallVecInsn &V = Map.lookup(R); + ToErase.push_back(R); + if (V.size() < 2) + continue; + const VNType &VN = R; + SmallPtrSet VNBlocks; + for (auto &I : V) { + BasicBlock *BBI = I->getParent(); + if (!hasEH(BBI)) + VNBlocks.insert(BBI); + } + // Compute the Post Dominance Frontiers of each basic block + // The dominance frontier of a live block X in the reverse + // control graph is the set of blocks upon which X is control + // dependent. The following sequence computes the set of blocks + // which currently have dead terminators that are control + // dependence sources of a block which is in NewLiveBlocks. + IDFs.setDefiningBlocks(VNBlocks); + IDFBlocks.clear(); + IDFs.calculate(IDFBlocks); + + // Make a map of BB vs instructions to be hoisted. + for (unsigned i = 0; i < V.size(); ++i) { + InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i])); + } + // Insert empty CHI node for this VN. This is used to factor out + // basic blocks where the ANTIC can potentially change. 
+ for (auto IDFB : IDFBlocks) { for (unsigned i = 0; i < V.size(); ++i) { - InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i])); - } - // Insert empty CHI node for this VN. This is used to factor out - // basic blocks where the ANTIC can potentially change. - for (auto IDFB : IDFBlocks) { - for (unsigned i = 0; i < V.size(); ++i) { - CHIArg C = {VN, nullptr, nullptr}; - // Ignore spurious PDFs. - if (DT->properlyDominates(IDFB, V[i]->getParent())) { - OutValue[IDFB].push_back(C); - LLVM_DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName() - << ", for Insn: " << *V[i]); - } + CHIArg C = {VN, nullptr, nullptr}; + // Ignore spurious PDFs. + if (DT->properlyDominates(IDFB, V[i]->getParent())) { + OutValue[IDFB].push_back(C); + LLVM_DEBUG(dbgs() << "\nInserting a CHI for BB: " << IDFB->getName() + << ", for Insn: " << *V[i]); } } } - - // Insert CHI args at each PDF to iterate on factored graph of - // control dependence. - insertCHI(InValue, OutValue); - // Using the CHI args inserted at each PDF, find fully anticipable values. - findHoistableCandidates(OutValue, K, HPL); } - // Return true when all operands of Instr are available at insertion point - // HoistPt. When limiting the number of hoisted expressions, one could hoist - // a load without hoisting its access function. So before hoisting any - // expression, make sure that all its operands are available at insert point. - bool allOperandsAvailable(const Instruction *I, - const BasicBlock *HoistPt) const { - for (const Use &Op : I->operands()) - if (const auto *Inst = dyn_cast(&Op)) - if (!DT->dominates(Inst->getParent(), HoistPt)) - return false; + // Erase VNs already handled. + for (auto V : ToErase) + Map.erase(V); - return true; - } + // Insert CHI args at each PDF to iterate on factored graph of + // control dependence. + insertCHI(InValue, OutValue); + // Using the CHI args inserted at each PDF, find fully anticipable values. 
+ findHoistableCandidates(OutValue, K, HPL); +} - // Same as allOperandsAvailable with recursive check for GEP operands. - bool allGepOperandsAvailable(const Instruction *I, +bool +GVNHoist::allOperandsAvailable(Instruction *I, + BasicBlock *NewDest, + SmallVecImplVal &Unav) const { + for (Use &Op : I->operands()) + if (auto *Inst = dyn_cast(&Op)) + if (!DT->dominates(Inst->getParent(), NewDest)) + Unav.push_back(Op.get()); + + return Unav.empty(); +} + +// Same as allOperandsAvailable with recursive check for GEP operands. +bool GVNHoist::allGepOperandsAvailable(const Instruction *I, const BasicBlock *HoistPt) const { for (const Use &Op : I->operands()) if (const auto *Inst = dyn_cast(&Op)) @@ -842,14 +1455,13 @@ } } return true; - } +} - // Make all operands of the GEP available. - void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt, +// Make all operands of the GEP available. +void GVNHoist::makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt, const SmallVecInsn &InstructionsToHoist, Instruction *Gep) const { - assert(allGepOperandsAvailable(Gep, HoistPt) && - "GEP operands not available"); + assert(allGepOperandsAvailable(Gep, HoistPt) && "GEP operands not available"); Instruction *ClonedGep = Gep->clone(); for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i) @@ -885,9 +1497,9 @@ // Replace uses of Gep with ClonedGep in Repl. Repl->replaceUsesOfWith(Gep, ClonedGep); - } +} - void updateAlignment(Instruction *I, Instruction *Repl) { +void GVNHoist::updateAlignment(Instruction *I, Instruction *Repl) { if (auto *ReplacementLoad = dyn_cast(Repl)) { ReplacementLoad->setAlignment(MaybeAlign(std::min( ReplacementLoad->getAlignment(), cast(I)->getAlignment()))); @@ -904,11 +1516,11 @@ } else if (isa(Repl)) { ++NumCallsRemoved; } - } +} - // Remove all the instructions in Candidates and replace their usage with Repl. - // Returns the number of instructions removed. 
- unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl, +// Remove all the instructions in Candidates and replace their usage with Repl. +// Returns the number of instructions removed. +unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl, MemoryUseOrDef *NewMemAcc) { unsigned NR = 0; for (Instruction *I : Candidates) { @@ -931,10 +1543,10 @@ } } return NR; - } +} - // Replace all Memory PHI usage with NewMemAcc. - void raMPHIuw(MemoryUseOrDef *NewMemAcc) { +// Replace all Memory PHI usage with NewMemAcc. +void GVNHoist::raMPHIuw(MemoryUseOrDef *NewMemAcc) { SmallPtrSet UsePhis; for (User *U : NewMemAcc->users()) if (MemoryPhi *Phi = dyn_cast(U)) @@ -947,11 +1559,56 @@ MSSAUpdater->removeMemoryAccess(Phi); } } +} + +void GVNHoist::removeAndReplace(Instruction *I, Instruction *Repl, + MemoryAccess *NewMemAcc) { + updateAlignment(I, Repl); + if (NewMemAcc) { + // Update the uses of the old MSSA access with NewMemAcc. + MemoryAccess *OldMA = MSSA->getMemoryAccess(I); + OldMA->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(OldMA); } - // Remove all other instructions and replace them with Repl. - unsigned removeAndReplace(const SmallVecInsn &Candidates, Instruction *Repl, - BasicBlock *DestBB, bool MoveAccess) { + Repl->andIRFlags(I); + combineKnownMetadata(Repl, I); + I->replaceAllUsesWith(Repl); + // Also invalidate the Alias Analysis cache. + MD->removeInstruction(I); + I->eraseFromParent(); +} + +void GVNHoist::removeAndReplace(const SmallVecInsn &InstructionsToHoist, + Instruction *Repl) { + MemoryAccess *NewMemAcc = MSSA->getMemoryAccess(Repl); + // Remove and rename all other instructions. + for (Instruction *I : InstructionsToHoist) + if (I != Repl) { + removeAndReplace(I, Repl, NewMemAcc); + } +} + +void GVNHoist::removeMPhi(MemoryAccess *NewMemAcc) { + // Remove MemorySSA phi nodes with the same arguments. 
+ SmallPtrSet UsePhis; + for (User *U : NewMemAcc->users()) + if (MemoryPhi *Phi = dyn_cast(U)) + UsePhis.insert(Phi); + + for (auto *Phi : UsePhis) { + auto In = Phi->incoming_values(); + if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) { + Phi->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(Phi); + } + } +} + +// Remove all other instructions and replace them with Repl. +unsigned GVNHoist::removeAndReplace(const SmallVecInsn &Candidates, + Instruction *Repl, BasicBlock *DestBB, + bool MoveAccess) { MemoryUseOrDef *NewMemAcc = MSSA->getMemoryAccess(Repl); if (MoveAccess && NewMemAcc) { // The definition of this ld/st will not change: ld/st hoisting is @@ -966,12 +1623,13 @@ if (NewMemAcc) raMPHIuw(NewMemAcc); return NR; - } +} - // In the case Repl is a load or a store, we make all their GEPs - // available: GEPs are not hoisted by default to avoid the address - // computations to be hoisted without the associated load or store. - bool makeGepOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt, +// In the case Repl is a load or a store, we make all their GEPs +// available: GEPs are not hoisted by default to avoid the address +// computations to be hoisted without the associated load or store. +bool GVNHoist::makeGepOperandsAvailable( + Instruction *Repl, BasicBlock *HoistPt, const SmallVecInsn &InstructionsToHoist) const { // Check whether the GEP of a ld/st can be synthesized at HoistPt. GetElementPtrInst *Gep = nullptr; @@ -1002,9 +1660,9 @@ makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Val); return true; - } +} - std::pair hoist(HoistingPointList &HPL) { +std::pair GVNHoist::hoist(HoistingPointList &HPL) { unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0; for (const HoistingPointInfo &HP : HPL) { // Find out whether we already have one of the instructions in HoistPt, @@ -1025,7 +1683,8 @@ bool MoveAccess = true; if (Repl) { // Repl is already in HoistPt: it remains in place. 
- assert(allOperandsAvailable(Repl, DestBB) && + SmallVecVal V; + assert(allOperandsAvailable(Repl, DestBB, V) && "instruction depends on operands that are not available"); MoveAccess = false; } else { @@ -1036,7 +1695,8 @@ // We can move Repl in HoistPt only when all operands are available. // The order in which hoistings are done may influence the availability // of operands. - if (!allOperandsAvailable(Repl, DestBB)) { + SmallVecVal V; + if (!allOperandsAvailable(Repl, DestBB, V)) { // When HoistingGeps there is nothing more we can do to make the // operands available: just continue. if (HoistingGeps) @@ -1073,11 +1733,11 @@ NumStoresHoisted += NS; NumCallsHoisted += NC; return {NI, NL + NC + NS}; - } +} - // Hoist all expressions. Returns Number of scalars hoisted - // and number of non-scalars hoisted. - std::pair hoistExpressions(Function &F) { +/*/ Hoist all expressions. Returns Number of scalars hoisted +// and number of non-scalars hoisted. +std::pair GVNHoist::hoistExpressions(Function &F) { InsnInfo II; LoadInfo LI; StoreInfo SI; @@ -1135,8 +1795,376 @@ computeInsertionPoints(CI.getLoadVNTable(), HPL, InsKind::Load); computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store); return hoist(HPL); +}*/ + +// Update MemorySSA if mem-refs are sunk. +std::pair GVNHoist::sink(GVNHoist::HoistingPointList &HPL) { + unsigned NI = 0, NL = 0, NS = 0, NC = 0; + + std::sort(HPL.begin(), HPL.end(), [this](const HoistingPointInfo &HP1, + const HoistingPointInfo &HP2) { + // Sort in descending order to have last executing insn first in HPL. + return !DFSInOrder(HP1.second[0], HP2.second[0]); + }); + + for (const HoistingPointInfo &HP : HPL) { + BasicBlock *SinkPt = HP.first; + const SmallVecInsn &InstructionsToHoist = HP.second; + assert(InstructionsToHoist.size() == 2); + Instruction *I0 = InstructionsToHoist[0]; + Instruction *I1 = InstructionsToHoist[1]; + + // We need to check availability for I0 and I1. 
TODO: Additionally, we can + // still sink if the operand not available is also getting sunk will + // available operands. + SmallVecVal I0UnavOps, I1UnavOps; + allOperandsAvailable(I0, SinkPt, I0UnavOps); + allOperandsAvailable(I1, SinkPt, I1UnavOps); + + // Sink only when all operands are available, or only one is unavailable. + if (I0UnavOps.size() != I1UnavOps.size() || I1UnavOps.size() > 1) + continue; + + if (I0UnavOps.size() == 1) { + // Check that the unavailable operand is safe to sink to PHI. + if (lastUser(I0, I0UnavOps[0]) != I0 || lastUser(I1, I1UnavOps[0]) != I1) + continue; + } + // Keep I0 and remove I1 and PN + PHINode *PN = cast(I0->user_back()); + MemoryAccess *I0MemAccess = MSSA->getMemoryAccess(I0); + LLVM_DEBUG(dbgs() << "\nSinking:" << *I0 << *I1 << *PN); + + updateAlignment(I1, I0); + I0->andIRFlags(I1); + combineKnownMetadata(I0, I1); + // I1->replaceAllUsesWith(I0); Not required because I1 used once in PHI. + // Also invalidate the Alias Analysis cache. + MD->removeInstruction(I1); + MD->removeInstruction(PN); + PN->replaceAllUsesWith(I0); + // FIXME: Update Memory SSA + // Update the uses of the old MSSA access with I0. + // Remove MSSA for I1 + // Remove MSSA for PN, and replace with I0 + // Combine metadata of I0 and I1 + // Memory PHI? + // assert(0); + bool found = false; + auto It = SinkPt->begin(); + while (It != SinkPt->end()) { // Find the last PHI to insert I0 + if (!isa(It)) + break; + if (PN == &*It) + found = true; + ++It; + } + assert(found && "PHI replaced in wrong BB"); + BasicBlock *I0BB = I0->getParent(); + I0->moveBefore(*SinkPt, It); + + if (I0UnavOps.empty()) + PN->eraseFromParent(); + else { + Value* V = I0UnavOps[0]; + PHINode *NewPN = PHINode::Create(V->getType(), 2, "", PN); + PN->eraseFromParent(); + // Move the incoming operands in this PHI. 
+ NewPN->addIncoming(I0UnavOps[0], I0BB); + NewPN->addIncoming(I1UnavOps[0], I1->getParent()); + bool OpReplaced = false; + for (int i = 0, e = I0->getNumOperands(); i != e; ++i) + if (I0->getOperand(i) == I0UnavOps[0]) { + I0->setOperand(i, NewPN); + OpReplaced = true; + } + assert(OpReplaced && "Illegal transformation"); + } + + if (I0MemAccess) { // I0, I1 are mem-refs + assert(MSSA->getMemoryAccess(I1)); + // This MSSA update is only for mem-refs with one use in a PHI in Succ BB. + MemoryAccess *I1MemAccess = MSSA->getMemoryAccess(I1); + MemoryUseOrDef *I0UD = cast(I0MemAccess); + MemoryUseOrDef *I1UD = cast(I1MemAccess); + + MemoryAccess *I0Def = I0UD->getDefiningAccess(); + MemoryAccess *I1Def = I1UD->getDefiningAccess(); + + MemoryPhi *NewPHI = MSSA->getMemoryAccess(SinkPt); + if (!NewPHI) { + NewPHI = MSSA->createMemoryPhi(SinkPt); + NewPHI->addIncoming(I0Def, I0->getParent()); + NewPHI->addIncoming(I1Def, I1->getParent()); + } + I0UD->setDefiningAccess(NewPHI); + MSSAUpdater->removeMemoryAccess(I0MemAccess); + MSSAUpdater->removeMemoryAccess(I1MemAccess); + } + + I1->eraseFromParent(); + NumSunk++; + if (isa(I0)) + ++NL; + else if (isa(I0)) + ++NS; + else if (isa(I0)) + ++NC; + else // Scalar + ++NI; } -}; + return {NI, NL + NC + NS}; +} + +bool GVNHoist::getVN(Instruction *I) { + // Do not value number terminator instructions. + if (I->isTerminator()) + return false; + + if (auto *Load = dyn_cast(I)) + LI.insert(Load, VN); + else if (auto *Store = dyn_cast(I)) + SI.insert(Store, VN); + else if (auto *Call = dyn_cast(I)) { + if (auto *Intr = dyn_cast(Call)) { + if (isa(Intr) || + Intr->getIntrinsicID() == Intrinsic::assume || + Intr->getIntrinsicID() == Intrinsic::sideeffect) + return true; + } + if (Call->mayHaveSideEffects()) + return false; + if (Call->isConvergent()) + return false; + CI.insert(Call, VN); + } else if (HoistingGeps || !isa(I)) + // Do not hoist scalars past calls that may write to memory because + // that could result in spills later. 
geps are handled separately. + // TODO: We can relax this for targets like AArch64 as they have more + // registers than X86. + II.insert(I, VN); + + return true; +} + +void GVNHoist::barriersAndVNs(Function &F, bool Sinking) { + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { + int InstructionNb = 0; + bool BarrierFound = false; + SmallVector, 4> CurrentBBVNs; + for (Instruction &I1 : *BB) { + // If I1 cannot guarantee progress, subsequent instructions + // in BB cannot be hoisted anyways. + // TODO: For sink operation anything after hoist barrier is okay to sink + // but nothing before the hoist barrier. + if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) { + // NumBarriers++; + HoistBarrier.insert(BB); + BarrierFound = true; + break; + } + + // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting + // deeper may increase the register pressure and compilation time. + if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB) + break; + + if (!getVN(&I1)) + break; + } + if (Sinking && BarrierFound) { + // TODO: clear VNs for this basicblock. + // Instructions after the last barriers can be sunk. 
+ for (BasicBlock::reverse_iterator It = BB->rbegin(), Ie = BB->rend(); + It != Ie; ++It) { + if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) + break; + if (!getVN(&*It)) + break; + } + } + } +} + +std::pair GVNHoist::hoistExpressions(Function &F) { + barriersAndVNs(F, false); + HoistingPointList HPL; + computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar); + computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load); + computeInsertionPoints(SI.getVNTable(), HPL, InsKind::Store); + computeInsertionPoints(CI.getScalarVNTable(), HPL, InsKind::Scalar); + computeInsertionPoints(CI.getLoadVNTable(), HPL, InsKind::Load); + computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store); + return hoist(HPL); +} + +std::pair GVNHoist::sinkExpressions(Function &F) { + barriersAndVNs(F, false); // First clear VNs before barrier in that BB. see + // TODO then set it to true. + std::pair Count; + bool ContinueSink = false; + unsigned MaxSinkIter = 3; + unsigned SinkIter = 0; + do { + HoistingPointList HPL; + findSinkableInsn(II.getVNTable(), HPL, InsKind::Scalar); + findSinkableInsn(LI.getVNTable(), HPL, InsKind::Load); + findSinkableInsn(SI.getVNTable(), HPL, InsKind::Store); + findSinkableInsn(CI.getScalarVNTable(), HPL, InsKind::Scalar); + findSinkableInsn(CI.getLoadVNTable(), HPL, InsKind::Load); + findSinkableInsn(CI.getStoreVNTable(), HPL, InsKind::Store); + // FIXME: Clear items from VNTable once they are hoisted/sink + auto ThisCount = sink(HPL); + Count.first += ThisCount.first; + Count.second += ThisCount.second; + unsigned SunkCount = ThisCount.first + ThisCount.second; + ++SinkIter; + ContinueSink = SunkCount > 0 ? 
MaxSinkIter > SinkIter : false; + } while (ContinueSink); + return Count; +} + +void GVNHoist::constructDJGraph(DomLevelsT &DomLevels, DenseSet &JEdges, + BBLevelT &BBLevels) { + for (auto DFI = df_begin(DT->getRootNode()), DFE = df_end(DT->getRootNode()); + DFI != DFE; ++DFI) { + // Since getPathLength is inclusive of both the terminal nodes + // i.e., Entry and *DFI so decrease by 1. + unsigned Depth = DFI.getPathLength() - 1; + BasicBlock *BB = (*DFI)->getBlock(); + DomLevels[BB] = Depth; + BBLevels[Depth].push_back(BB); + for (BasicBlock *Succ : successors(BB)) + if (!DT->properlyDominates(BB, Succ)) { + JEdges.insert(std::make_pair(BB, Succ)); + } + } +} + +bool GVNHoist::constructMergeSet(DomLevelsT &DomLevels, DenseSet &JEdges, + BBLevelT &BBLevels) { + bool Repeat = false; + DenseSet VisJEdges; // Visited J Edges. + unsigned PrevLev = 0; + for (std::pair &P : BBLevels) { + assert(PrevLev <= P.first); + PrevLev = P.first; + for (BasicBlock *CurrBB : P.second) { + for (auto PB = pred_begin(CurrBB), PE = pred_end(CurrBB); PB != PE; + ++PB) { + EdgeT Edge(*PB, CurrBB); // For all incoming edges to CurrBB. + if (JEdges.count(Edge) && !VisJEdges.count(Edge)) { + VisJEdges.insert(Edge); // Visit + BasicBlock *Src = Edge.first; + BasicBlock *Dst = Edge.second; + BasicBlock *INode = nullptr; + MergeSet[Dst].insert(Dst); // The target of JEdge. + while (DomLevels[Src] >= DomLevels[Dst]) { // A backedge. + LLVM_DEBUG(dbgs() << "\nVisiting: " << Src->getName() << " -> " + << Dst->getName()); + // Merge (tmp) = Merge (tmp) U Merge (tnode) U { tnode } + // MergeSet(tnode) contains tnode. + MergeSet[Src].insert(MergeSet[Dst].begin(), MergeSet[Dst].end()); + INode = Src; + LLVM_DEBUG(dbgs() << ", IDom of " << Src->getName() << " is "); + Src = DT->getNode(Src)->getIDom()->getBlock(); + LLVM_DEBUG(dbgs() << Src->getName()); + } + for (auto PINode = pred_begin(INode), PENode = pred_end(INode); + PINode != PENode; ++PINode) { // INode is an ancestor of SNode. 
+ EdgeT Edge(*PINode, INode); + if (VisJEdges.count(Edge)) { + assert(JEdges.count(Edge)); + BasicBlock *SNode = *PINode; + // Check inconsistency, MergeSet[Dest] subset of MergeSet[Src] + if (!isSubset(MergeSet[INode], MergeSet[SNode])) + Repeat = true; + } + } + } + } + } + } + return Repeat; +} + +const Instruction *GVNHoist::lastUser(const Instruction *I, + const Value *Val) const { + // TODO: Make isLiveOutUsingMergeSet take const parameters. + assert(!isLiveOutUsingMergeSet(const_cast(I->getParent()), + const_cast(Val))); + const BasicBlock *BB = I->getParent(); + BasicBlock::const_iterator BI(I), BE = BB->end(); + unsigned ICount = std::distance(BI, BE); + // If there are less #uses than ICount it is cheaper to iterate on Val->users. + if (Val->getNumUses() <= ICount) { // Iterate on uses + for (const User *U : Val->users()) { + const Instruction *UserI = cast(U); + if (UserI != I && UserI->getParent() == BB) { + if (firstInBB(I, UserI)) // I precedes another Use => not a kill. + return UserI; + } + } + return I; + } + // else Iterate on Instructions + for (++BI; BI != BE; ++BI) { + for (unsigned i = 0; i < BI->getNumOperands(); ++i) + if (BI->getOperandUse(i).get() == Val) + return &*BI; + } + return I; +} + +bool GVNHoist::isLiveOutUsingMergeSet(BasicBlock *BB, Value *Val) const { + assert(BB); + const BasicBlock *ValDefBB = nullptr; // BasicBlock defining Val + if (Instruction *I = dyn_cast(Val)) + ValDefBB = I->getParent(); + // FIXME! + // We are assuming when DefBB is not defined then the value is a parameter. + + // Case when Val is defined in BB, if any of the use is outside BB (DefBB) + // then it must be live-out. 
+ if (ValDefBB == BB) + for (User *U : Val->users()) { + if (cast(U)->getParent() != BB) + return true; + } + + // Mr(n) = M(n) U {n}; Create a new set from the merge set + // Ms(n) = Ms(n) U Mr(w); For each successor w of BB + SmallSetBB Ms; // Ms = null-set + for (BasicBlock *Succ : successors(BB)) { + Ms.insert(Succ); // Mr(Succ) = Succ U M(Succ) + for (BasicBlock *BB : MergeSet.lookup(Succ)) + Ms.insert(BB); // M(Succ) + } + + // Iterate over all the uses of Val, if any of its users is in the mergeset + // of \p BB then Val is LiveOut from BB. + for (User *U : Val->users()) { + BasicBlock *UserDefBB = nullptr; + if (Instruction *I = dyn_cast(U)) + UserDefBB = I->getParent(); + else // Assuming live-out conservatively, the user can be a global + // TODO: maybe return false is okay?? + return true; // llvm_unreachable("User is not an instruction."); + // For globals, skip the MergeSet checks. + if (UserDefBB->getParent() != BB->getParent()) + continue; + while (UserDefBB && (UserDefBB != ValDefBB)) { + if (Ms.count(UserDefBB)) // if t /\ Ms(n) then return true; + return true; + auto N = DT->getNode(UserDefBB); + if (auto IDom = N->getIDom()) + UserDefBB = IDom->getBlock(); + else // Reached the beginning of the function => maybe a global. + return false; // Assume that global doesn't flow live-out from BB + } + } + return false; +} class GVNHoistLegacyPass : public FunctionPass { public: diff --git a/llvm/test/Transforms/GVNHoist/broken-mod-sink.ll b/llvm/test/Transforms/GVNHoist/broken-mod-sink.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/GVNHoist/broken-mod-sink.ll @@ -0,0 +1,100 @@ +; RUN: opt -S -gvn-hoist -gvn-hoist-check-profitability=false < %s | FileCheck %s + +; Check that all and, inttoptr, and bitcast is sunk. 
+; CHECK-LABEL: @baz0 +; CHECK-LABEL: bb9: +; CHECK: and +; CHECK: inttoptr +; CHECK: bitcast + + +; ModuleID = 'broken-mod.ll' + +%struct.pluto = type { i32 (...)**, i8, i8, i16, i32 } + +define void @baz0(i64 %a) { +bb: + br i1 undef, label %bb1, label %bb4 + +bb1: ; preds = %bb + %tmp = and i64 %a, -8 + %tmp2 = inttoptr i64 %tmp to i8* + %tmp3 = bitcast i8* %tmp2 to %struct.pluto* + br label %bb9 + +bb4: ; preds = %bb + br i1 undef, label %bb5, label %bb11 + +bb5: ; preds = %bb4 + %tmp6 = and i64 %a, -8 + %tmp7 = inttoptr i64 %tmp6 to i8* + %tmp8 = bitcast i8* %tmp7 to %struct.pluto* + br label %bb9 + +bb9: ; preds = %bb5, %bb1 + %tmp10 = phi %struct.pluto* [ %tmp8, %bb5 ], [ %tmp3, %bb1 ] + unreachable + +bb11: ; preds = %bb4 + ret void +} + +; CHECK-LABEL: @baz +; CHECK-LABEL: bb9: +; CHECK: bitcast +; Function Attrs: nounwind uwtable +define void @baz() { +bb: + %tmp = and i64 undef, -8 + %tmp2 = inttoptr i64 %tmp to i8* + br i1 undef, label %bb1, label %bb4 + +bb1: ; preds = %bb + %tmp3 = bitcast i8* %tmp2 to %struct.pluto* + br label %bb9 + +bb4: ; preds = %bb + br i1 undef, label %bb5, label %bb11 + +bb5: ; preds = %bb4 + %tmp8 = bitcast i8* %tmp2 to %struct.pluto* + br label %bb9 + +bb9: ; preds = %bb5, %bb1 + %tmp10 = phi %struct.pluto* [ %tmp8, %bb5 ], [ %tmp3, %bb1 ] + unreachable + +bb11: ; preds = %bb4 + ret void +} + +; CHECK-LABEL: @baz1 +; CHECK-LABEL: bb9: +; CHECK: inttoptr +; CHECK: bitcast +define void @baz1() { +bb: + %tmp = and i64 undef, -8 + br i1 undef, label %bb1, label %bb4 + +bb1: ; preds = %bb + %tmp2 = inttoptr i64 %tmp to i8* + %tmp10 = bitcast i8* %tmp2 to %struct.pluto* + br label %bb9 + +bb4: ; preds = %bb + br i1 undef, label %bb5, label %bb11 + +bb5: ; preds = %bb4 + %tmp3 = inttoptr i64 %tmp to i8* + %tmp8 = bitcast i8* %tmp3 to %struct.pluto* + br label %bb9 + +bb9: ; preds = %bb5, %bb1 + %tmp11 = phi %struct.pluto* [ %tmp8, %bb5 ], [ %tmp10, %bb1 ] + unreachable + +bb11: ; preds = %bb4 + ret void +} + diff --git
a/llvm/test/Transforms/GVNHoist/dj-edge-detect-das-taco.ll b/llvm/test/Transforms/GVNHoist/dj-edge-detect-das-taco.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/GVNHoist/dj-edge-detect-das-taco.ll @@ -0,0 +1,62 @@ +; RUN: opt -S -gvn-hoist -debug < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Testcase taken from: +; Das, Dibyendu, B. Dupont De Dinechin, and Ramakrishna Upadrasta. "Efficient +; liveness computation using merge sets and dj-graphs." ACM Transactions on +; Architecture and Code Optimization (TACO) 8.4 (2012): 27. + +; check the J edges +; CHECK: Found a JEdge: bb10 -> bb8 +; CHECK: Found a JEdge: bb4 -> bb5 +; CHECK: Found a JEdge: bb5 -> bb6 +; CHECK: Found a JEdge: bb6 -> bb5 +; CHECK: Found a JEdge: bb7 -> bb2 +; CHECK: Found a JEdge: bb9 -> bb6 + +; check the mergesets +; CHECK: MergeSet of: bb10: bb2, bb8, bb5, bb6, +; CHECK: MergeSet of: bb2: bb2, +; CHECK: MergeSet of: bb3: bb2, +; CHECK: MergeSet of: bb4: bb2, bb5, bb6, +; CHECK: MergeSet of: bb5: bb2, bb5, bb6, +; CHECK: MergeSet of: bb6: bb2, bb5, bb6, +; CHECK: MergeSet of: bb7: bb2, +; CHECK: MergeSet of: bb8: bb2, bb8, bb5, bb6, +; CHECK: MergeSet of: bb9: bb2, bb8, bb5, bb6, + + +define void @irreducible_loop() { +bb1: +br label %bb2 + +bb2: + br i1 undef, label %bb3, label %bb11 + +bb3: + br i1 undef, label %bb4, label %bb8 + +bb4: + br label %bb5 + +bb5: + br label %bb6 + +bb6: + br i1 undef, label %bb7, label %bb5 + +bb7: + br label %bb2 + +bb8: + br label %bb9 + +bb9: + br i1 undef, label %bb6, label %bb10 + +bb10: + br label %bb8 + +bb11: + ret void +} \ No newline at end of file diff --git a/llvm/test/Transforms/GVNHoist/dj-edge-detect.ll b/llvm/test/Transforms/GVNHoist/dj-edge-detect.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/GVNHoist/dj-edge-detect.ll @@ -0,0 +1,108 @@ +; RUN: opt -S -gvn-hoist -debug < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +; check the J edges +; CHECK: Found a JEdge: bb10 -> bb18 +; CHECK: Found a JEdge: bb13 
-> bb18 +; CHECK: Found a JEdge: bb15 -> bb18 +; CHECK: Found a JEdge: bb18 -> bb6 +; CHECK: Found a JEdge: bb2 -> bb3 +; CHECK: Found a JEdge: bb8 -> bb18 + + +; check the mergesets +; CHECK: MergeSet of: bb10: bb3, bb6, bb18, +; CHECK: MergeSet of: bb13: bb3, bb6, bb18, +; CHECK: MergeSet of: bb15: bb3, bb6, bb18, +; CHECK: MergeSet of: bb18: bb3, bb6, bb18, +; CHECK: MergeSet of: bb2: bb3, +; CHECK: MergeSet of: bb3: bb3, +; CHECK: MergeSet of: bb6: bb3, bb6, +; CHECK: MergeSet of: bb8: bb3, bb6, bb18, + + +%0 = type { %1*, %8* } +%1 = type { %2*, %3* } +%2 = type { i32, i32, i32, i32, i32*, i8*, i32 } +%3 = type { %4, i32, i32, %7*, i8, i8, i8, i8, i8 } +%4 = type { i8, i32, i8*, %5 } +%5 = type { %6 } +%6 = type { i16 } +%7 = type { i8*, i8*, i8*, i8*, i8* } +%8 = type <{ i8*, i8*, i32, %9, %17*, %19* }> +%9 = type { %10*, i8*, i8*, i8* (i8*)*, i32* (i8*)*, i8*, %15, i32, i32, i8*, %16*, i32, i8, i8*, i8, i8* } +%10 = type { i8*, %11*, i8*, %12, %13*, i8*, i32 (i8*)*, i32, i8, %14 } +%11 = type { i8*, i32 (i8*, %12*, i8*)*, i32 (i8*)*, i32 (i8*, i32)*, %16* (i8*)*, i32 (i8*)*, i32 (i8*)*, i32 (...)*, i32 (...)*, i32 (...)*, void (i8*, i32)*, void (i8*, i32, i8*, i32)*, i32 (i8*, i32, i8*)* } +%12 = type { i8*, i32 (i8*, i32, i8*, i32, i8)*, i32 (i8*, i32, i8*)*, void (i8*, i8*, i32)*, i32 (i8*)*, i32 (i8*, i32, i32)* } +%13 = type { i32 (i8*, i8*, i32)*, i32 (i8*, i32, i32)* } +%14 = type { i32, i32, i8, i8, i8 } +%15 = type { i32, i32, i32 } +%16 = type { i16, i16, i32, i32 } +%17 = type { i8*, i8*, i16 (i8*, i8*, i16)*, i32 (i8*, i8, i32)*, i32 (%16*, i32)*, %18, i32, i32 } +%18 = type { i8*, i8* (i8*, i8*, i32)* } +%19 = type { %20, i32, i32 } +%20 = type { i32, i32, [8 x i8], i8, i8, i16 } + +@global = external hidden unnamed_addr constant [10 x i8], align 1 + +define void @ham(i8* nocapture readnone %arg) local_unnamed_addr { +bb: + %tmp = alloca %0*, align 8 + %tmp1 = bitcast %0** %tmp to i8* + br label %bb3 + +bb2: ; preds = %bb6 + br label %bb3 + +bb3: 
; preds = %bb2, %bb + %tmp4 = load %0*, %0** %tmp, align 8, !tbaa !0 + %tmp5 = call i32 @quux(i8* %tmp1) + br label %bb6 + +bb6: ; preds = %bb18, %bb3 + %tmp7 = phi i32 [ %tmp5, %bb3 ], [ %tmp19, %bb18 ] + switch i32 %tmp7, label %bb15 [ + i32 0, label %bb2 + i32 35, label %bb8 + i32 11, label %bb10 + i32 12, label %bb13 + ] + +bb8: ; preds = %bb6 + %tmp9 = load %0*, %0** %tmp, align 8, !tbaa !0 + br label %bb18 + +bb10: ; preds = %bb6 + %tmp11 = load %0*, %0** %tmp, align 8, !tbaa !0 + %tmp12 = call i32 @quux(i8* %tmp1) + br label %bb18 + +bb13: ; preds = %bb6 + %tmp14 = call i32 @quux(i8* %tmp1) + br label %bb18 + +bb15: ; preds = %bb6 + %tmp16 = load %0*, %0** %tmp, align 8, !tbaa !0 + %tmp17 = call i32 @quux(i8* null) + br label %bb18 + +bb18: ; preds = %bb15, %bb13, %bb10, %bb8 + %tmp19 = phi i32 [ %tmp17, %bb15 ], [ %tmp14, %bb13 ], [ %tmp12, %bb10 ], [ 0, %bb8 ] + br label %bb6 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +declare i32 @zot(%0**) + +declare i32 @quux(i8*) + +declare i32 @wombat(i8* nocapture readonly, ...) 
+ +attributes #0 = { argmemonly nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"any pointer", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} diff --git a/llvm/test/Transforms/GVNHoist/gvn-sink-test-1.ll b/llvm/test/Transforms/GVNHoist/gvn-sink-test-1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/GVNHoist/gvn-sink-test-1.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -gvn-hoist -debug < %s +; ModuleID = 'gvn-sink-test.ll' +source_filename = "bugpoint-output-9bbed91.bc" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.spam = type opaque + +declare void @bar() local_unnamed_addr #0 + +; Function Attrs: noinline nounwind uwtable +define fastcc void @wombat() unnamed_addr #1 { +bb: + br i1 undef, label %bb1, label %bb10 + +bb1: ; preds = %bb + br i1 undef, label %bb2, label %bb3 + +bb2: ; preds = %bb1 + unreachable + +bb3: ; preds = %bb1 + br i1 undef, label %bb4, label %bb5 + +bb4: ; preds = %bb3 + %tmp = load %struct.spam*, %struct.spam** undef, align 8 + br label %bb7 + +bb5: ; preds = %bb3 + %tmp6 = load %struct.spam*, %struct.spam** undef, align 8 + br label %bb7 + +bb7: ; preds = %bb5, %bb4 + %tmp8 = phi %struct.spam* [ %tmp, %bb4 ], [ %tmp6, %bb5 ] + call void @bar() #2 + %tmp9 = load %struct.spam*, %struct.spam** undef, align 8 + unreachable + +bb10: ; preds = %bb + ret void +} + +attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 "} diff --git a/llvm/test/Transforms/GVNHoist/hoist-convergent.ll b/llvm/test/Transforms/GVNHoist/hoist-convergent.ll --- a/llvm/test/Transforms/GVNHoist/hoist-convergent.ll +++ b/llvm/test/Transforms/GVNHoist/hoist-convergent.ll @@ -1,4 +1,4 @@ -; RUN: opt -gvn-hoist -S < %s | FileCheck %s +; RUN: opt -gvn-hoist -gvn-hoist-check-profitability=false -S < %s | FileCheck %s ; Check that convergent calls are not hoisted. ; diff --git a/llvm/test/Transforms/GVNHoist/hoist-md.ll b/llvm/test/Transforms/GVNHoist/hoist-md.ll --- a/llvm/test/Transforms/GVNHoist/hoist-md.ll +++ b/llvm/test/Transforms/GVNHoist/hoist-md.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -gvn-hoist < %s | FileCheck %s +; RUN: opt -S -gvn-hoist -gvn-hoist-check-profitability=false < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/GVNHoist/hoist-mssa.ll b/llvm/test/Transforms/GVNHoist/hoist-mssa.ll --- a/llvm/test/Transforms/GVNHoist/hoist-mssa.ll +++ b/llvm/test/Transforms/GVNHoist/hoist-mssa.ll @@ -1,7 +1,10 @@ -; RUN: opt -S -gvn-hoist -newgvn < %s | FileCheck %s +; RUN: opt -S -gvn-hoist -gvn-hoist-check-profitability=false -newgvn < %s | FileCheck %s -; Check that store hoisting works: there should be only one store left. 
-; CHECK-LABEL: @getopt +; Check that store hoisting works: there should be only one store hoisted +; Hoist load as well because the @optind global is killed in bb4 (at least once) +; CHECK-LABEL: @getopt( +; CHECK: load i32 +; CHECK: add nsw i32 ; CHECK: store i32 ; CHECK-NOT: store i32 @@ -47,6 +50,51 @@ ret void } +; Check that store hoisting works: there should be only one store left because +; load should also be hoisted in this case. +; CHECK-LABEL: @getopt_hoist_load( +; CHECK: store i32 +; CHECK-NOT: store i32 +define void @getopt_hoist_load(i32* %p1) { +bb: + br label %bb1 + +bb1: ; preds = %bb + br i1 undef, label %bb2, label %bb3 + +bb2: ; preds = %bb1 + br label %bb13 + +bb3: ; preds = %bb1 + br i1 undef, label %bb4, label %bb9 + +bb4: ; preds = %bb3 + %tmp = load i32, i32* @optind, align 4 + br i1 undef, label %bb5, label %bb7 + +bb5: ; preds = %bb4 + %tmp6 = add nsw i32 %tmp, 1 + store i32 %tmp6, i32* %p1, align 4 + br label %bb12 + +bb7: ; preds = %bb4 + %tmp8 = add nsw i32 %tmp, 1 + store i32 %tmp8, i32* %p1, align 4 + br label %bb13 + +bb9: ; preds = %bb3 + %tmp10 = load i32, i32* @optind, align 4 + %tmp11 = add nsw i32 %tmp10, 1 + store i32 %tmp11, i32* %p1, align 4 + br label %bb12 + +bb12: ; preds = %bb9, %bb5 + br label %bb13 + +bb13: ; preds = %bb12, %bb7, %bb2 + ret void +} + @GlobalVar = internal global float 1.000000e+00 ; Check that we hoist stores and remove the MSSA phi node. diff --git a/llvm/test/Transforms/GVNHoist/hoist-pr20242.ll b/llvm/test/Transforms/GVNHoist/hoist-pr20242.ll --- a/llvm/test/Transforms/GVNHoist/hoist-pr20242.ll +++ b/llvm/test/Transforms/GVNHoist/hoist-pr20242.ll @@ -1,4 +1,4 @@ -; RUN: opt -gvn-hoist -newgvn -gvn-hoist -S < %s | FileCheck %s +; RUN: opt -gvn-hoist -newgvn -gvn-hoist -gvn-hoist-check-profitability=false -S < %s | FileCheck %s ; Test to demonstrate that newgvn creates opportunities for ; more gvn-hoist when sibling branches contain identical expressions. 
diff --git a/llvm/test/Transforms/GVNHoist/hoist-pr22005.ll b/llvm/test/Transforms/GVNHoist/hoist-pr22005.ll --- a/llvm/test/Transforms/GVNHoist/hoist-pr22005.ll +++ b/llvm/test/Transforms/GVNHoist/hoist-pr22005.ll @@ -1,4 +1,4 @@ -; RUN: opt -gvn-hoist -S < %s | FileCheck %s +; RUN: opt -gvn-hoist -gvn-hoist-check-profitability=false -S < %s | FileCheck %s target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/GVNHoist/hoist-recursive-geps.ll b/llvm/test/Transforms/GVNHoist/hoist-recursive-geps.ll --- a/llvm/test/Transforms/GVNHoist/hoist-recursive-geps.ll +++ b/llvm/test/Transforms/GVNHoist/hoist-recursive-geps.ll @@ -1,4 +1,4 @@ -; RUN: opt -gvn-hoist -newgvn -gvn-hoist -S < %s | FileCheck %s +; RUN: opt -gvn-hoist -newgvn -gvn-hoist -gvn-hoist-check-profitability=false -S < %s | FileCheck %s ; Check that recursive GEPs are hoisted. Since hoisting creates ; fully redundant instructions, newgvn is run to remove them which then diff --git a/llvm/test/Transforms/GVNHoist/hoist-very-busy.ll b/llvm/test/Transforms/GVNHoist/hoist-very-busy.ll --- a/llvm/test/Transforms/GVNHoist/hoist-very-busy.ll +++ b/llvm/test/Transforms/GVNHoist/hoist-very-busy.ll @@ -53,3 +53,25 @@ store i32 1, i32* @G ret void } + + +; Check that the two identical stores to @G are hoisted: only one store remains.
+; CHECK-LABEL: define void @bun( +; CHECK: store +; CHECK-NOT: store + +define void @bun() { +entry: + br label %if.then + +if.then: ; preds = %entry + br i1 undef, label %sw0, label %sw1 + +sw0: + store i32 1, i32* @G + unreachable + +sw1: + store i32 1, i32* @G + ret void +} diff --git a/llvm/test/Transforms/GVNHoist/hoist.ll b/llvm/test/Transforms/GVNHoist/hoist.ll --- a/llvm/test/Transforms/GVNHoist/hoist.ll +++ b/llvm/test/Transforms/GVNHoist/hoist.ll @@ -1,4 +1,4 @@ -; RUN: opt -gvn-hoist -S < %s | FileCheck %s +; RUN: opt -gvn-hoist -gvn-hoist-check-profitability=false -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/GVNHoist/infinite-loop-direct.ll b/llvm/test/Transforms/GVNHoist/infinite-loop-direct.ll --- a/llvm/test/Transforms/GVNHoist/infinite-loop-direct.ll +++ b/llvm/test/Transforms/GVNHoist/infinite-loop-direct.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -gvn-hoist < %s | FileCheck %s +; RUN: opt -S -gvn-hoist -gvn-hoist-check-profitability=false < %s | FileCheck %s ; Checking gvn-hoist in case of infinite loops and irreducible control flow. diff --git a/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll b/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll --- a/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll +++ b/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -gvn-hoist < %s | FileCheck %s +; RUN: opt -S -gvn-hoist -gvn-hoist-check-profitability=false < %s | FileCheck %s ; Checking gvn-hoist in case of indirect branches. 
diff --git a/llvm/test/Transforms/GVNHoist/int_sideeffect.ll b/llvm/test/Transforms/GVNHoist/int_sideeffect.ll --- a/llvm/test/Transforms/GVNHoist/int_sideeffect.ll +++ b/llvm/test/Transforms/GVNHoist/int_sideeffect.ll @@ -1,4 +1,4 @@ -; RUN: opt -S < %s -gvn-hoist | FileCheck %s +; RUN: opt -S < %s -gvn-hoist -gvn-hoist-check-profitability=false | FileCheck %s declare void @llvm.sideeffect() diff --git a/llvm/test/Transforms/GVNHoist/pr37445.ll b/llvm/test/Transforms/GVNHoist/pr37445.ll --- a/llvm/test/Transforms/GVNHoist/pr37445.ll +++ b/llvm/test/Transforms/GVNHoist/pr37445.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -early-cse-memssa -gvn-hoist -S | FileCheck %s +; RUN: opt < %s -early-cse-memssa -gvn-hoist -gvn-hoist-check-profitability=false -S | FileCheck %s ; Make sure opt won't crash and that this pair of ; instructions (load, icmp) is hoisted successfully diff --git a/llvm/test/Transforms/GVNHoist/sink-load-dom-numbering.ll b/llvm/test/Transforms/GVNHoist/sink-load-dom-numbering.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/GVNHoist/sink-load-dom-numbering.ll @@ -0,0 +1,58 @@ +; RUN: opt -S -gvn-hoist < %s | FileCheck %s + +; CHECK: load +; CHECK-NOT: load + +%struct.blam = type { %struct.wibble } +%struct.wibble = type { i8*, i32 } + +declare void @hoge() + +define void @eggs() { +bb: + br label %bb1 + +bb1: ; preds = %bb + br label %bb2 + +bb2: ; preds = %bb1 + br i1 undef, label %bb3, label %bb4 + +bb3: ; preds = %bb2 + unreachable + +bb4: ; preds = %bb2 + br label %bb5 + +bb5: ; preds = %bb4 + br i1 undef, label %bb6, label %bb9 + +bb6: ; preds = %bb5 + br i1 undef, label %bb8, label %bb7 + +bb7: ; preds = %bb6 + call void @hoge() + br label %bb8 + +bb8: ; preds = %bb7, %bb6 + %tmp = load %struct.blam*, %struct.blam** undef, align 8, !tbaa !1 + br label %bb11 + +bb9: ; preds = %bb5 + %tmp10 = load %struct.blam*, %struct.blam** undef, align 8, !tbaa !1 + br label %bb11 + +bb11: ; preds = %bb9, %bb8 + %tmp12 = phi %struct.blam* [ %tmp, %bb8 
], [ %tmp10, %bb9 ] + unreachable +} + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 "} +!1 = !{!2, !3, i64 8} +!2 = !{!"tree_common", !3, i64 0, !3, i64 8, !6, i64 16, !6, i64 17, !6, i64 17, !6, i64 17, !6, i64 17, !6, i64 17, !6, i64 17, !6, i64 17, !6, i64 17, !6, i64 18, !6, i64 18, !6, i64 18, !6, i64 18, !6, i64 18, !6, i64 18, !6, i64 18, !6, i64 18, !6, i64 19, !6, i64 19, !6, i64 19, !6, i64 19, !6, i64 19, !6, i64 19, !6, i64 19, !6, i64 19} +!3 = !{!"any pointer", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = !{!"int", !4, i64 0} diff --git a/llvm/test/Transforms/GVNHoist/sink-load-with-memory-phi-1.ll b/llvm/test/Transforms/GVNHoist/sink-load-with-memory-phi-1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/GVNHoist/sink-load-with-memory-phi-1.ll @@ -0,0 +1,57 @@ +; RUN: opt -S -gvn-hoist < %s | FileCheck %s + +; Check that load is sunk to its successor. +; CHECK: load +; CHECK-NOT: load + +@reg_last_set_invalid = external global i8*, align 8 + +; Function Attrs: nounwind uwtable +define void @record_value_for_reg() { +entry: + br label %for.cond + +for.cond: ; preds = %entry + br label %for.cond139 + +for.cond139: ; preds = %if.end156, %for.cond + br i1 undef, label %for.body142, label %for.end159 + +for.body142: ; preds = %for.cond139 + store i32 undef, i32* undef, align 4, !tbaa !1 + br i1 undef, label %land.lhs.true146, label %if.else + +land.lhs.true146: ; preds = %for.body142 + br i1 undef, label %if.then151, label %if.else + +if.then151: ; preds = %land.lhs.true146 + %0 = load i8*, i8** @reg_last_set_invalid, align 8, !tbaa !5 + br label %if.end156 + +if.else: ; preds = %land.lhs.true146, %for.body142 + %1 = load i8*, i8** @reg_last_set_invalid, align 8, !tbaa !5 + br label %if.end156 + +if.end156: ; preds = %if.else, %if.then151 + %.sink5 = phi i8* [ %1, %if.else ], [ %0, %if.then151 ] + br label %for.cond139 + +for.end159: ; preds = %for.cond139 + br i1 undef, label 
%land.lhs.true161, label %if.end174 + +land.lhs.true161: ; preds = %for.end159 + br label %if.end174 + +if.end174: ; preds = %land.lhs.true161, %for.end159 + unreachable +} + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 "} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!6, !6, i64 0} +!6 = !{!"any pointer", !3, i64 0} diff --git a/llvm/test/Transforms/GVNHoist/sink-load-with-memory-phi-2.ll b/llvm/test/Transforms/GVNHoist/sink-load-with-memory-phi-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/GVNHoist/sink-load-with-memory-phi-2.ll @@ -0,0 +1,45 @@ +; RUN: opt -S -gvn-hoist < %s | FileCheck %s + +; CHECK: load +; CHECK-NOT: load +@reg_last_set_invalid = external global i8*, align 8 + +; Function Attrs: nounwind uwtable +define void @record_value_for_reg() { +entry: + br label %for.cond + +for.cond: ; preds = %entry + br label %for.cond139 + +for.cond139: ; preds = %if.end156, %for.cond + br label %for.body142 + +for.body142: ; preds = %for.cond139 + br i1 undef, label %land.lhs.true146, label %if.else + +land.lhs.true146: ; preds = %for.body142 + br i1 undef, label %if.then151, label %if.else + +if.then151: ; preds = %land.lhs.true146 + %0 = load i8*, i8** @reg_last_set_invalid, align 8, !tbaa !1 + br label %if.end156 + +if.else: ; preds = %land.lhs.true146, %for.body142 + %1 = load i8*, i8** @reg_last_set_invalid, align 8, !tbaa !1 + br label %if.end156 + +if.end156: ; preds = %if.else, %if.then151 + %.sink5 = phi i8* [ %1, %if.else ], [ %0, %if.then151 ] + store i8 undef, i8* undef, align 1, !tbaa !5 + br label %for.cond139 +} + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 "} +!1 = !{!2, !2, i64 0} +!2 = !{!"any pointer", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!3, !3, i64 0}