Index: include/llvm/Transforms/Utils/MemorySSA.h
===================================================================
--- include/llvm/Transforms/Utils/MemorySSA.h
+++ include/llvm/Transforms/Utils/MemorySSA.h
@@ -518,6 +518,31 @@
     return It == PerBlockAccesses.end() ? nullptr : It->second.get();
   }
 
+  enum InsertionPlace { Beginning, End };
+
+  /// \brief Create a MemoryAccess in MemorySSA at a specified point in a
+  /// block, with a specified clobbering definition.
+  ///
+  /// This should be called when a memory instruction has been created to
+  /// replace an existing memory instruction. It will *not* create PHI nodes,
+  /// nor verify the clobbering definition. The insertion place is used solely
+  /// to determine where in the block's MemorySSA access list the new access
+  /// will be placed. Returns the new MemoryAccess.
+  MemoryAccess *createMemoryAccess(Instruction *I, MemoryAccess *Definition,
+                                   const BasicBlock *BB, InsertionPlace Point);
+
+  /// \brief Create a MemoryAccess in MemorySSA before or after an existing
+  /// MemoryAccess, with a specified clobbering definition.
+  ///
+  /// As above, this should be called when a memory instruction has been
+  /// created to replace an existing memory instruction. It will *not* create
+  /// PHI nodes, nor verify the clobbering definition. The new access is placed
+  /// immediately before (respectively after) \p InsertPt in that block's
+  /// access list. Returns the new MemoryAccess.
+  MemoryAccess *createMemoryAccessBefore(Instruction *I,
+                                         MemoryAccess *Definition,
+                                         MemoryAccess *InsertPt);
+  MemoryAccess *createMemoryAccessAfter(Instruction *I,
+                                        MemoryAccess *Definition,
+                                        MemoryAccess *InsertPt);
+
   /// \brief Remove a MemoryAccess from MemorySSA, including updating all
   /// definitions and uses.
   /// This should be called when a memory instruction that has a MemoryAccess
@@ -526,8 +551,6 @@
   /// on the MemoryAccess for that store/load.
   void removeMemoryAccess(MemoryAccess *);
 
-  enum InsertionPlace { Beginning, End };
-
   /// \brief Given two memory accesses in the same basic block, determine
   /// whether MemoryAccess \p A dominates MemoryAccess \p B.
   bool locallyDominates(const MemoryAccess *A, const MemoryAccess *B) const;
@@ -554,13 +577,14 @@
   void markUnreachableAsLiveOnEntry(BasicBlock *BB);
   bool dominatesUse(const MemoryAccess *, const MemoryAccess *) const;
   MemoryAccess *createNewAccess(Instruction *, bool ignoreNonMemory = false);
+  MemoryUseOrDef *createDefinedAccess(Instruction *, MemoryAccess *);
   MemoryAccess *findDominatingDef(BasicBlock *, enum InsertionPlace);
   void removeFromLookups(MemoryAccess *);
   MemoryAccess *renameBlock(BasicBlock *, MemoryAccess *);
   void renamePass(DomTreeNode *, MemoryAccess *IncomingVal,
                   SmallPtrSet<BasicBlock *, 16> &Visited);
-  AccessListType *getOrCreateAccessList(BasicBlock *);
+  AccessListType *getOrCreateAccessList(const BasicBlock *);
   AliasAnalysis *AA;
   DominatorTree *DT;
   Function &F;
Index: lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
===================================================================
--- lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -90,8 +90,10 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
+#include <unordered_set>
 #include <vector>
+#include <utility>
 
 using namespace llvm;
 
@@ -102,14 +104,67 @@
 //===----------------------------------------------------------------------===//
 namespace {
 
+// This hash matches the most common things isSameOperationAs checks. It must
+// always check a subset of (or exactly) what isSameOperationAs checks, so that
+// stores the equality functor considers equal can never hash differently.
+
+struct StoreInstHash {
+  size_t operator()(const StoreInst *A) const {
+    return hash_combine(A->getType(), A->getPointerOperand()->getType(),
+                        A->getValueOperand()->getType(), A->isVolatile(),
+                        A->getAlignment(), A->getOrdering(),
+                        A->getSynchScope());
+  }
+};
+
+struct StoreInstEq {
+  bool operator()(const StoreInst *A, const StoreInst *B) const {
+    return (A == B || A->isSameOperationAs(B));
+  }
+};
+
+typedef std::pair<LoadInst *, MemoryAccess *> LoadPair;
+
+// This hash has two parts. The first is the instruction, the second the
+// clobbering MemorySSA access. For the instruction part, this hash matches the
+// most common things isSameOperationAs checks. It must always check a subset
+// of (or exactly) what isSameOperationAs checks, so that loads the equality
+// functor considers equal can never hash differently.
+
+struct LoadPairHash {
+  size_t operator()(const LoadPair &LP) const {
+    const LoadInst *A = LP.first;
+    return hash_combine(A->getType(), A->getPointerOperand()->getType(),
+                        A->isVolatile(), A->getAlignment(), A->getOrdering(),
+                        A->getSynchScope(), LP.second);
+  }
+};
+
+struct LoadPairEq {
+  bool operator()(const LoadPair &A, const LoadPair &B) const {
+    return (A.second == B.second &&
+            (A.first == B.first || A.first->isSameOperationAs(B.first)));
+  }
+};
+
 class MergedLoadStoreMotion : public FunctionPass {
   AliasAnalysis *AA;
   MemoryDependenceAnalysis *MD;
+  MemorySSA *MSSA;
+  MemorySSAWalker *CachingWalker;
+  DominatorTree *DT;
+  bool UseMemorySSA;
+  // The non-MemorySSA versions of mergeLoads/mergeStores could have Size0 *
+  // Size1 complexity, where Size0 and Size1 are the #instructions on the two
+  // sides of the diamond. The constant chosen here is arbitrary. Compile time
+  // is controlled by the check Size0 * Size1 < MagicCompileTimeControl.
+  const int MagicCompileTimeControl;
 
 public:
   static char ID; // Pass identification, replacement for typeid
   MergedLoadStoreMotion()
-      : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) {
+      : FunctionPass(ID), MD(nullptr), UseMemorySSA(true),
+        MagicCompileTimeControl(250) {
     initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry());
   }
 
@@ -120,8 +175,11 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<MemorySSALazy>();
     AU.addRequired<AAResultsWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<MemorySSALazy>();
     AU.addPreserved<MemoryDependenceAnalysis>();
   }
 
@@ -135,15 +193,15 @@
   BasicBlock *getDiamondTail(BasicBlock *BB);
   bool isDiamondHead(BasicBlock *BB);
   // Routines for hoisting loads
-  bool isLoadHoistBarrierInRange(const Instruction& Start,
-                                 const Instruction& End,
-                                 LoadInst* LI);
+  bool isLoadHoistBarrierInRange(const Instruction &Start,
+                                 const Instruction &End, LoadInst *LI);
   LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
   void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
                         Instruction *ElseInst);
   bool isSafeToHoist(Instruction *I) const;
   bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst);
   bool mergeLoads(BasicBlock *BB);
+  bool mergeLoadsMemorySSA(BasicBlock *BB);
   // Routines for sinking stores
   StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
   PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
@@ -151,11 +209,12 @@
                                   const Instruction &End, MemoryLocation Loc);
   bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
   bool mergeStores(BasicBlock *BB);
-  // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
-  // where Size0 and Size1 are the #instructions on the two sides of
-  // the diamond. The constant chosen here is arbitrary. Compiler Time
-  // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
-  const int MagicCompileTimeControl;
+  void computeDFSNumbers();
+
+  std::unordered_multiset<LoadPair, LoadPairHash, LoadPairEq> LoadInfo;
+  std::unordered_multiset<StoreInst *, StoreInstHash, StoreInstEq> StoreInfo;
+  SmallPtrSet<MemoryAccess *, 8> AccessesToDelete;
+  DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> DFSDomMap;
 };
 
 char MergedLoadStoreMotion::ID = 0;
@@ -176,6 +235,39 @@
 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
 INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion",
                     "MergedLoadStoreMotion", false, false)
+
+/// \brief Compute DFS in/out numbers for the dominator tree.
+///
+/// Sadly, while the dominator tree keeps internal DFS numbers, we cannot rely
+/// on them being up to date: they are an implementation detail that is only
+/// recomputed after a certain number of queries.
+void MergedLoadStoreMotion::computeDFSNumbers() {
+  unsigned DFSNum = 0;
+  SmallVector<std::pair<const DomTreeNode *, DomTreeNode::const_iterator>, 32>
+      WorkStack;
+
+  WorkStack.emplace_back(DT->getRootNode(), DT->getRootNode()->begin());
+  DFSDomMap[DT->getRoot()].first = DFSNum++;
+
+  while (!WorkStack.empty()) {
+    const DomTreeNode *Node = WorkStack.back().first;
+    DomTreeNode::const_iterator ChildIt = WorkStack.back().second;
+    BasicBlock *BB = Node->getBlock();
+
+    // If we have visited all of the children of this node, "recurse" back up
+    // the stack, recording the DFS out-number.
+    if (ChildIt == Node->end()) {
+      DFSDomMap[BB].second = DFSNum++;
+      WorkStack.pop_back();
+    } else {
+      // Otherwise, recursively visit this child.
+      const DomTreeNode *Child = *ChildIt;
+      ++WorkStack.back().second;
+
+      WorkStack.emplace_back(Child, Child->begin());
+      DFSDomMap[Child->getBlock()].first = DFSNum++;
+    }
+  }
+}
 
 ///
 /// \brief Remove instruction from parent and update memory dependence analysis.
@@ -240,9 +332,9 @@
 /// being loaded or protect against the load from happening
 /// it is considered a hoist barrier.
 ///
-bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start,
-                                                      const Instruction& End,
-                                                      LoadInst* LI) {
+bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction &Start,
+                                                      const Instruction &End,
+                                                      LoadInst *LI) {
   MemoryLocation Loc = MemoryLocation::get(LI);
   return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod);
 }
@@ -306,6 +398,18 @@
   // Hoist instruction.
   HoistedInst->insertBefore(HoistPt);
 
+  // We also hoist operands of loads using this function, so check to see if
+  // this is really a memory access before we try to update MemorySSA for it.
+  MemoryAccess *HoistCandAccess = MSSA->getMemoryAccess(HoistCand);
+  if (HoistCandAccess) {
+    MemoryUseOrDef *MUD = cast<MemoryUseOrDef>(HoistCandAccess);
+    // Create the access for the hoisted instruction, then remove the accesses
+    // of the two instructions it replaces.
+    MSSA->createMemoryAccess(HoistedInst, MUD->getDefiningAccess(), BB,
+                             MemorySSA::End);
+    MSSA->removeMemoryAccess(HoistCandAccess);
+    MSSA->removeMemoryAccess(MSSA->getMemoryAccess(ElseInst));
+  }
   HoistCand->replaceAllUsesWith(HoistedInst);
   removeInstruction(HoistCand);
@@ -348,6 +452,88 @@
   } else
     return false;
 }
+
+///
+/// \brief Try to hoist two loads to the same address into the diamond header
+///
+/// Starting from a diamond head block, put the loads of one successor block
+/// into a hash table. Then walk the loads of the other successor block and
+/// check whether each one matches an entry in the table and can be hoisted.
+///
+bool MergedLoadStoreMotion::mergeLoadsMemorySSA(BasicBlock *BB) {
+  bool MergedLoads = false;
+  assert(isDiamondHead(BB));
+  BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  BasicBlock *Succ0 = BI->getSuccessor(0);
+  BasicBlock *Succ1 = BI->getSuccessor(1);
+  // #Instructions in Succ1 for compile-time control.
+  int Size1 = Succ1->size();
+  int NLoads = 0;
+
+  // Skip if we don't have memory accesses in both blocks.
+  auto *Succ0Accesses = MSSA->getBlockAccesses(Succ0);
+  if (!Succ0Accesses)
+    return false;
+  auto *Succ1Accesses = MSSA->getBlockAccesses(Succ1);
+  if (!Succ1Accesses)
+    return false;
+
+  for (auto &Access : *Succ0Accesses) {
+    const MemoryUse *MU = dyn_cast<MemoryUse>(&Access);
+    if (!MU)
+      continue;
+    Instruction *I = MU->getMemoryInst();
+    // Only hoist simple (non-atomic, non-volatile) loads.
+    LoadInst *Load0 = dyn_cast<LoadInst>(I);
+    if (!Load0 || !Load0->isSimple() || Load0->isUsedOutsideOfBlock(Succ0))
+      continue;
+
+    LoadInfo.insert({Load0, CachingWalker->getClobberingMemoryAccess(Load0)});
+    ++NLoads;
+    if (NLoads * Size1 >= MagicCompileTimeControl)
+      break;
+  }
+
+  for (auto AI = Succ1Accesses->begin(), AE = Succ1Accesses->end(); AI != AE;) {
+    auto CurrIter = AI;
+    ++AI;
+    const MemoryUse *MU = dyn_cast<MemoryUse>(&*CurrIter);
+    if (!MU)
+      continue;
+    LoadInst *Load1 = dyn_cast<LoadInst>(MU->getMemoryInst());
+    if (!Load1 || Load1->isUsedOutsideOfBlock(Succ1))
+      continue;
+
+    // If a load in the table has the same clobbering access as this one, the
+    // two cannot be killed before the same point. That is, all loads that
+    // could possibly be merged must reach a common clobbering MemoryDef (or
+    // MemoryPhi); if they do not, something is in the way on one of the
+    // branches and we cannot merge them. If we *do* find a possible match,
+    // the only further check needed is that both are loads of the same
+    // pointer. We could simply compare the pointer operands, but isMustAlias
+    // can do a better job of it.
+
+    auto LookupResult = LoadInfo.equal_range(
+        {Load1, CachingWalker->getClobberingMemoryAccess(Load1)});
+    bool Res = false;
+    auto LookupIter = LookupResult.first;
+    while (!Res && LookupIter != LookupResult.second) {
+      LoadInst *Load0 = LookupIter->first;
+      auto OldIter = LookupIter;
+      ++LookupIter;
+      MemoryLocation Loc0 = MemoryLocation::get(Load0);
+      MemoryLocation Loc1 = MemoryLocation::get(Load1);
+      if (AA->isMustAlias(Loc0, Loc1))
+        Res = hoistLoad(BB, Load0, Load1);
+      MergedLoads |= Res;
+      // If the hoist succeeded, Load0 no longer exists in Succ0, so drop its
+      // now-stale table entry.
+      if (Res)
+        LoadInfo.erase(OldIter);
+    }
+  }
+  LoadInfo.clear();
+  return MergedLoads;
+}
 
 ///
 /// \brief Try to hoist two loads to same address into diamond header
@@ -416,17 +602,17 @@
     Instruction *Inst = &*RBI;
 
     if (!isa<StoreInst>(Inst))
-       continue;
+      continue;
 
     StoreInst *Store1 = cast<StoreInst>(Inst);
 
     MemoryLocation Loc0 = MemoryLocation::get(Store0);
     MemoryLocation Loc1 = MemoryLocation::get(Store1);
     if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
-      !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))),
-                                 BB1->back(), Loc1) &&
-      !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))),
-                                 BB0->back(), Loc0)) {
+        !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))),
+                                   BB1->back(), Loc1) &&
+        !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))),
+                                   BB0->back(), Loc0)) {
       return Store1;
     }
   }
@@ -567,20 +753,26 @@
 bool MergedLoadStoreMotion::runOnFunction(Function &F) {
   MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  if (UseMemorySSA) {
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    MSSA = &getAnalysis<MemorySSALazy>().getMSSA();
+    CachingWalker = MSSA->buildMemorySSA(AA, DT);
+  }
 
   bool Changed = false;
   DEBUG(dbgs() << "Instruction Merger\n");
 
   // Merge unconditional branches, allowing PRE to catch more
   // optimization opportunities.
-  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
-    BasicBlock *BB = &*FI++;
-
+  for (BasicBlock &BB : F) {
     // Hoist equivalent loads and sink stores
     // outside diamonds when possible
-    if (isDiamondHead(BB)) {
-      Changed |= mergeLoads(BB);
-      Changed |= mergeStores(getDiamondTail(BB));
+    if (isDiamondHead(&BB)) {
+      if (UseMemorySSA)
+        Changed |= mergeLoadsMemorySSA(&BB);
+      else
+        Changed |= mergeLoads(&BB);
+      Changed |= mergeStores(getDiamondTail(&BB));
     }
   }
   return Changed;
Index: lib/Transforms/Utils/MemorySSA.cpp
===================================================================
--- lib/Transforms/Utils/MemorySSA.cpp
+++ lib/Transforms/Utils/MemorySSA.cpp
@@ -215,7 +215,8 @@
     MA.dropAllReferences();
 }
 
-MemorySSA::AccessListType *MemorySSA::getOrCreateAccessList(BasicBlock *BB) {
+MemorySSA::AccessListType *
+MemorySSA::getOrCreateAccessList(const BasicBlock *BB) {
   auto Res = PerBlockAccesses.insert(std::make_pair(BB, nullptr));
 
   if (Res.second)
@@ -356,6 +357,59 @@
   return Walker;
 }
 
+MemoryUseOrDef *MemorySSA::createDefinedAccess(Instruction *I,
+                                               MemoryAccess *Definition) {
+  MemoryAccess *NewAccess = createNewAccess(I, true);
+  assert(
+      NewAccess != nullptr &&
+      "Tried to create a memory access for a non-memory touching instruction");
+  MemoryUseOrDef *MUD = cast<MemoryUseOrDef>(NewAccess);
+  MUD->setDefiningAccess(Definition);
+  return MUD;
+}
+
+MemoryAccess *MemorySSA::createMemoryAccess(Instruction *I,
+                                            MemoryAccess *Definition,
+                                            const BasicBlock *BB,
+                                            InsertionPlace Point) {
+  MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
+  auto *Accesses = getOrCreateAccessList(BB);
+  if (Point == Beginning) {
+    // It goes after any phi nodes
+    auto AI = Accesses->begin();
+    while (AI != Accesses->end()) {
+      if (!isa<MemoryPhi>(AI))
+        break;
+      ++AI;
+    }
+    Accesses->insert(AI, NewAccess);
+  } else {
+    Accesses->push_back(NewAccess);
+  }
+
+  return NewAccess;
+}
+
+MemoryAccess *MemorySSA::createMemoryAccessBefore(Instruction *I,
+                                                  MemoryAccess *Definition,
+                                                  MemoryAccess *InsertPt) {
+  assert(I->getParent() == InsertPt->getBlock() &&
+         "New and old access must be in the same block");
+  MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
+  auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
+  Accesses->insert(AccessListType::iterator(InsertPt), NewAccess);
+  return NewAccess;
+}
+
+MemoryAccess *MemorySSA::createMemoryAccessAfter(Instruction *I,
+                                                 MemoryAccess *Definition,
+                                                 MemoryAccess *InsertPt) {
+  assert(I->getParent() == InsertPt->getBlock() &&
+         "New and old access must be in the same block");
+  MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
+  auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
+  Accesses->insertAfter(AccessListType::iterator(InsertPt), NewAccess);
+  return NewAccess;
+}
+
 /// \brief Helper function to create new memory accesses
 MemoryAccess *MemorySSA::createNewAccess(Instruction *I,
                                          bool IgnoreNonMemory) {
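
A note on the intended use of the new MemorySSA hooks: the pattern in hoistInstruction() above, create the replacement's access from the retired access's defining access and then removeMemoryAccess() the accesses being retired, is the general recipe this API is designed for. The sketch below restates that recipe outside the pass. It is illustrative only: the helper name is invented, and it assumes NewInst has already been inserted into the same block as OldInst and touches memory in the same way.

#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/MemorySSA.h"
using namespace llvm;

static void replaceMemoryInst(MemorySSA *MSSA, Instruction *OldInst,
                              Instruction *NewInst) {
  // Only update MemorySSA if the old instruction actually had an access.
  if (MemoryAccess *OldAccess = MSSA->getMemoryAccess(OldInst)) {
    auto *OldUseOrDef = cast<MemoryUseOrDef>(OldAccess);
    // Give NewInst an access immediately before the old one, reusing the old
    // access's defining access as the (unverified) clobbering definition.
    // Per the header comment, no MemoryPhi nodes are created here.
    MSSA->createMemoryAccessBefore(NewInst, OldUseOrDef->getDefiningAccess(),
                                   OldAccess);
    // Retire the old access; removeMemoryAccess updates its defs and uses.
    MSSA->removeMemoryAccess(OldAccess);
  }
}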
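
The comments on StoreInstHash and LoadPairHash state the invariant that makes these functors safe to use with std::unordered_multiset: the hash may only examine fields that the equality functor (ultimately isSameOperationAs) also compares, so two elements the equality considers equal can never land in different buckets. LoadPairHash additionally folds in the clobbering MemoryAccess, which is what makes the equal_range() probe in mergeLoadsMemorySSA() sound: only loads that reach the same clobber can ever compare equal. The snippet below is a small, hypothetical consistency check of that invariant, written as if it lived next to the functors in MergedLoadStoreMotion.cpp; the function name is invented and it is not part of the patch.

static void checkHashEqConsistency(const StoreInst *A, const StoreInst *B) {
  StoreInstHash Hash;
  StoreInstEq Eq;
  // If the equality functor says the two stores are interchangeable, their
  // hashes must agree. The converse need not hold: hash collisions are fine
  // and are resolved by the equality functor.
  if (Eq(A, B))
    assert(Hash(A) == Hash(B) &&
           "hash examines a field that isSameOperationAs does not");
}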
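
computeDFSNumbers() records, for every block, the counter value when its dominator-tree node is pushed (the pair's first member) and popped (second) during the iterative preorder walk. The consumer of DFSDomMap is not visible in this excerpt, but the usual purpose of such in/out numbers is a constant-time dominance test: A dominates B exactly when B's interval nests inside A's. A minimal sketch of that test follows, assuming DFSDomMap has been filled in by computeDFSNumbers(); the helper name is illustrative and not part of the patch.

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/BasicBlock.h"
#include <utility>
using namespace llvm;

static bool dfsDominates(
    const DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> &DFSDomMap,
    const BasicBlock *A, const BasicBlock *B) {
  std::pair<unsigned, unsigned> NumA = DFSDomMap.lookup(A);
  std::pair<unsigned, unsigned> NumB = DFSDomMap.lookup(B);
  // A was pushed before B and popped after B, i.e. B's in/out interval is
  // contained in A's, exactly when A is an ancestor of B in the dominator
  // tree, which is the definition of "A dominates B".
  return NumA.first <= NumB.first && NumB.second <= NumA.second;
}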