Index: llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp =================================================================== --- llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -14,7 +14,7 @@ // diamond (hammock) and merges them into a single store to the tail block // (footer). The algorithm iterates over the instructions of one side of the // diamond and attempts to find a matching store on the other side. New -// tail/footer block may be insterted if the tail/footer block has more +// tail/footer block may be inserted if the tail/footer block has more // predecessors (not only the two predecessors that are forming the diamond). It // sinks when it thinks it safe to do so. This optimization helps with eg. // hiding load latencies, triggering if-conversion, and reducing static code @@ -62,6 +62,41 @@ // <...> // // +// This pass also replaces conditional stores with unconditional ones (subject to +// constraints). Given a triangle shape before the transformation: +// +// header: +// br %cond, label %if.then, label %if.end +// + + +// + + +// + + +// if.then: + +// store %s, %addr + +// br label %if.end + +// + + +// + + +// + + +// if.end ("footer"): +// <...> +// +// After the conditional store replacement, this becomes: +// header: +// br %cond, label %if.then, label %if.else +// + + +// + + +// + + +// if.then: if.else: +// br label %if.end load %s.old, %addr +// br label %if.end +// + + +// + + +// + + +// if.end ("footer"): +// %s.new = phi [%s, if.then], [%s.old, if.else] +// <...> +// store %s.new, %addr_s +// <...> +// //===----------------------- TODO -----------------------------------------===// // // 1) Generalize to regions other than diamonds @@ -77,6 +112,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" #include 
"llvm/Support/Debug.h" @@ -94,6 +130,8 @@ //===----------------------------------------------------------------------===// class MergedLoadStoreMotion { AliasAnalysis *AA = nullptr; + MemorySSA *MSSA = nullptr; + DominatorTree *DT = nullptr; // The mergeStores algorithms could have Size0 * Size1 complexity, // where Size0 and Size1 are the number instructions on the two sides of @@ -104,11 +142,13 @@ const bool SplitFooterBB; public: MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {} - std::pair run(Function &F, AliasAnalysis &AA); + std::pair run(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT); private: bool isDiamondHead(BasicBlock *BB, BasicBlock *&Left, BasicBlock *&Right, BasicBlock *&Tail); + bool isTriangleHead(BasicBlock *BB, BasicBlock *&Side, BasicBlock *&Tail); // Routines for sinking stores StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); @@ -119,6 +159,11 @@ StoreInst *ElseInst); std::pair mergeStores(BasicBlock *Head, BasicBlock *Left, BasicBlock *Right, BasicBlock *Tail); + void sinkStore(StoreInst *); + std::pair checkEscapeAndDomination(const StoreInst *S, + const AllocaInst *AI); + bool isNonTrappingStore(const StoreInst *S, bool IsLocal, bool IsEscaping); + bool isLegalToReplaceStore(const StoreInst *S); }; } // end anonymous namespace @@ -148,6 +193,35 @@ return Tail && Tail == Right->getSingleSuccessor(); } +/// +/// True when BB is the head of a triangle +/// +bool MergedLoadStoreMotion::isTriangleHead(BasicBlock *BB, BasicBlock *&Side, + BasicBlock *&Tail) { + if (!BB) + return false; + auto *BI = dyn_cast(BB->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + + if (Succ0->hasNPredecessors(1)) { + Side = Succ0; + Tail = Succ1; + return Side->getSingleSuccessor() == Tail; + } + + if (Succ1->hasNPredecessors(1)) 
{ + Side = Succ1; + Tail = Succ0; + return Side->getSingleSuccessor() == Tail; + } + + return false; +} + /// /// True when instruction is a sink barrier for a store /// located in Loc @@ -320,15 +394,243 @@ return {MergedStores, SplitFooter}; } -std::pair MergedLoadStoreMotion::run(Function &F, - AliasAnalysis &AA) { - this->AA = &AA; +/// +/// Check whether the address `AI` of a local variable may escape from the +/// function and whether a load from or a store to precisely the same +/// location as the store `S` properly dominates `S`. `S` must be a simple store +/// to the object, allocated by `AI`. Return the escape property in the first +/// element and the domination property in the second element of the pair. +/// +std::pair +MergedLoadStoreMotion::checkEscapeAndDomination(const StoreInst *S, + const AllocaInst *AI) { + bool Escape = false; + bool Dominate = false; + SmallPtrSet Visited; + SmallVector Worklist; + + Worklist.push_back(AI); + while (!(Escape && Dominate) && !Worklist.empty()) { + const Value *V = Worklist.pop_back_val(); + for (const User *U : V->users()) { + const Instruction *I = dyn_cast(U); + if (I == nullptr || I == S) + continue; + switch (I->getOpcode()) { + default: + // If not handled in one of the other cases, conservatively assume the + // value escapes. + Escape = true; + break; + case Instruction::Ret: + case Instruction::AtomicRMW: + // A pointer cannot escape via these instructions. 
+ break; + case Instruction::Load: + if (!Dominate && + AA->alias(MemoryLocation::get(S), MemoryLocation::get(I)) == + AliasResult::MustAlias) + Dominate = DT->dominates(I, S); + break; + case Instruction::Call: { + const auto *CI = cast(I); + if (CI->isDebugOrPseudoInst() || CI->isLifetimeStartOrEnd()) + break; + } + LLVM_FALLTHROUGH; + case Instruction::Invoke: + case Instruction::CallBr: + if (any_of(cast(I)->data_ops(), + [=](const Use &Arg) { return Arg.get() == V; })) + Escape = true; + break; + case Instruction::Store: + if (V == cast(I)->getValueOperand()) + Escape = true; + if (!Dominate && + AA->alias(MemoryLocation::get(S), MemoryLocation::get(I)) == + AliasResult::MustAlias) + Dominate = DT->dominates(I, S); + break; + case Instruction::AtomicCmpXchg: + if (V == cast(I)->getNewValOperand()) + Escape = true; + break; + case Instruction::PHI: + if (!Visited.insert(cast(I)).second) + break; + LLVM_FALLTHROUGH; + case Instruction::GetElementPtr: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + case Instruction::Select: + Worklist.push_back(I); + break; + } + } + } - bool Changed = false; - bool CFGChanged = false; + return {Escape, Dominate}; +} + +bool MergedLoadStoreMotion::isNonTrappingStore(const StoreInst *S, bool IsLocal, + bool IsEscaping) { + SmallVector Worklist; + SmallPtrSet Visited; + + MemoryAccess *SDef = MSSA->getMemoryAccess(S); + Worklist.append(SDef->defs_begin(), SDef->defs_end()); + + while (!Worklist.empty()) { + const MemoryAccess *M = Worklist.pop_back_val(); + + // Can't guarantee anything if we reach the initial node. + if (MSSA->isLiveOnEntryDef(M)) + return false; + + // Continue traversing along the MemoryPhi operands. 
+ if (const auto *Phi = dyn_cast(M)) { + if (Visited.insert(Phi).second) + Worklist.append(Phi->defs_begin(), Phi->defs_end()); + continue; + } + + assert(isa(M) && "Unexpected MemoryUse"); + const auto *MDef = cast(M); + Instruction *I = MDef->getMemoryInst(); + + // Prevent an instruction from "shielding" itself. + if (I == S) + continue; + + switch (I->getOpcode()) { + default: + // Unknown MemoryDef or an atomic instruction, assume invalidating and + // bail out. + return false; + case Instruction::Store: + // Ordered or volatile store, assume invalidating. + if (!cast(I)->isSimple()) + return false; + // A write to precisely the same location feeding into the candidate store + // "shields" it from traps along the dependency chain. + // A write to a separate or partially overlapping memory location does + // not guarantee the candidate store is non-trapping, but does not + // preclude it from being such either, so continue traversing up the + // dependency chain. + if (AA->alias(MemoryLocation::get(S), MemoryLocation::get(I)) != + AliasResult::MustAlias) + Worklist.push_back(MDef->getDefiningAccess()); + break; + case Instruction::Call: + if (!IsLocal || IsEscaping) { + const auto *CI = cast(I); + if (!CI->isDebugOrPseudoInst() && !CI->isLifetimeStartOrEnd()) + // TODO: Continue traversing across well-behaved calls: stdlib + // functions, readonly/readnone, etc. + return false; + } + Worklist.push_back(MDef->getDefiningAccess()); + break; + } + } + + return true; +} + +// Check if it's legal to replace a conditional store with a load and +// unconditional store. 
+bool MergedLoadStoreMotion::isLegalToReplaceStore(const StoreInst *S) { + if (!S->isSimple()) + return false; + bool IsLocal = false, IsEscaping = false, HasDominating = false; + if (auto *AI = + dyn_cast(getUnderlyingObject(S->getPointerOperand()))) { + IsLocal = true; + std::tie(IsEscaping, HasDominating) = checkEscapeAndDomination(S, AI); + } + if (IsLocal && !IsEscaping && HasDominating) + return true; + return isNonTrappingStore(S, IsLocal, IsEscaping); +} + +void MergedLoadStoreMotion::sinkStore(StoreInst *S) { + BasicBlock *Body = S->getParent(); + BasicBlock *Head = Body->getSinglePredecessor(); + BasicBlock *Tail = Body->getSingleSuccessor(); + assert(Head && Tail && "Invalid CFG shape for single store replacement"); + + // Create a load on the "other" side of the branch. + BasicBlock *NewBody = SplitBlockPredecessors(Tail, {Head}, ".alt"); + Value *V = S->getValueOperand(); + auto *L = + new LoadInst(V->getType(), S->getPointerOperand(), V->getName() + ".old", + &*NewBody->getFirstInsertionPt()); + + // Sink the store to the tail block. + if (Tail->hasNPredecessorsOrMore(3)) + Tail = SplitBlockPredecessors(Tail, {Head, Body}, ".sink"); + + LLVM_DEBUG(dbgs() << "Sink single store into"; + Tail->printAsOperand(dbgs(), false); + dbgs() << "Store: ";S->dump(); + dbgs() << "Load: "; L->dump()); + S->removeFromParent(); + S->insertBefore(&*Tail->getFirstInsertionPt()); + + auto *Phi = + PHINode::Create(V->getType(), 2, V->getName() + ".new", &Tail->front()); + Phi->addIncoming(V, Body); + Phi->addIncoming(L, NewBody); + S->setOperand(0, Phi); +} + +std::pair MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA, + MemorySSA &MSSA, + DominatorTree &DT) { + this->AA = &AA; + this->MSSA = &MSSA; + this->DT = &DT; LLVM_DEBUG(dbgs() << "Instruction Merger\n"); + // Collect conditional stores which are legal to replace with unconditional + // ones. 
Do this before making any changes to the function, so we are using up + // to date dominator tree and Memory SSA graph. + SmallVector CondStores; + for (BasicBlock &BB : F) { + BasicBlock *IfBody, *IfTail; + if (!isTriangleHead(&BB, IfBody, IfTail)) + continue; + + // Have to insert a new tail, but it's disallowed. + if (!SplitFooterBB && IfTail->hasNPredecessorsOrMore(3)) + continue; + + // Don't sink if there are more than two (terminator and store) instructions + // in the block. + Instruction *I = IfBody->getTerminator()->getPrevNonDebugInstruction(true); + if (I == nullptr) + continue; + if (Instruction *P = I->getPrevNonDebugInstruction(true)) + continue; + + // Don't sink if it could introduce data race or invalid memory + // access to do so. + auto *S = dyn_cast(I); + if (S == nullptr || !isLegalToReplaceStore(S)) + continue; + + CondStores.push_back(S); + } + + // Now that we have collected the stores, do the replacement. + for (StoreInst *S : CondStores) + sinkStore(S); + + bool CFGChanged = !CondStores.empty(); + bool Changed = false; + // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. 
// This loop doesn't care about newly inserted/split blocks @@ -343,6 +645,7 @@ CFGChanged |= ChangeCFG; } } + return {Changed, CFGChanged}; } @@ -366,15 +669,17 @@ MergedLoadStoreMotion Impl(SplitFooterBB); bool Change, _; std::tie(Change, _) = - Impl.run(F, getAnalysis().getAAResults()); + Impl.run(F, getAnalysis().getAAResults(), + getAnalysis().getMSSA(), + getAnalysis().getDomTree()); return Change; } private: void getAnalysisUsage(AnalysisUsage &AU) const override { - if (!SplitFooterBB) - AU.setPreservesCFG(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addPreserved(); } }; @@ -392,6 +697,8 @@ INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -399,8 +706,11 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { MergedLoadStoreMotion Impl(Options.SplitFooterBB); auto &AA = AM.getResult(F); + auto &MSSA = AM.getResult(F).getMSSA(); + auto &DT = AM.getResult(F); + bool Change, CFGChange; - std::tie(Change, CFGChange) = Impl.run(F, AA); + std::tie(Change, CFGChange) = Impl.run(F, AA, MSSA, DT); if (!Change && !CFGChange) return PreservedAnalyses::all(); Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -489,8 +489,12 @@ ; GCN-O2-NEXT: Unroll loops ; GCN-O2-NEXT: SROA ; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: MergedLoadStoreMotion +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Phi Values Analysis +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) 
; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Memory Dependence Analysis ; GCN-O2-NEXT: Lazy Branch Probability Analysis @@ -848,8 +852,12 @@ ; GCN-O3-NEXT: Unroll loops ; GCN-O3-NEXT: SROA ; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: MergedLoadStoreMotion +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Phi Values Analysis +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Memory Dependence Analysis ; GCN-O3-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-LTO-pipeline.ll =================================================================== --- llvm/test/Other/opt-LTO-pipeline.ll +++ llvm/test/Other/opt-LTO-pipeline.ll @@ -117,9 +117,12 @@ ; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager Index: llvm/test/Other/opt-O2-pipeline.ll =================================================================== --- llvm/test/Other/opt-O2-pipeline.ll +++ llvm/test/Other/opt-O2-pipeline.ll @@ -136,8 +136,12 @@ ; CHECK-NEXT: Unroll loops ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis 
Index: llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
===================================================================
--- llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
+++ llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
@@ -141,8 +141,12 @@
 ; CHECK-NEXT: Unroll loops
 ; CHECK-NEXT: SROA
 ; CHECK-NEXT: Function Alias Analysis Results
+; CHECK-NEXT: Memory SSA
 ; CHECK-NEXT: MergedLoadStoreMotion
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Phi Values Analysis
+; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Memory Dependence Analysis
 ; CHECK-NEXT: Lazy Branch Probability Analysis
Index: llvm/test/Other/opt-O3-pipeline.ll
===================================================================
--- llvm/test/Other/opt-O3-pipeline.ll
+++ llvm/test/Other/opt-O3-pipeline.ll
@@ -141,8 +141,12 @@
 ; CHECK-NEXT: Unroll loops
 ; CHECK-NEXT: SROA
 ; CHECK-NEXT: Function Alias Analysis Results
+; CHECK-NEXT: Memory SSA
 ; CHECK-NEXT: MergedLoadStoreMotion
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Phi Values Analysis
+; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Memory Dependence Analysis
 ; CHECK-NEXT: Lazy Branch Probability Analysis
Index: llvm/test/Other/opt-Os-pipeline.ll
===================================================================
--- llvm/test/Other/opt-Os-pipeline.ll
+++ llvm/test/Other/opt-Os-pipeline.ll
@@ -122,8 +122,12 @@
 ; CHECK-NEXT: Unroll loops
 ; CHECK-NEXT: SROA
 ; CHECK-NEXT: Function Alias Analysis Results
+; CHECK-NEXT: Memory SSA
 ; CHECK-NEXT: MergedLoadStoreMotion
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Phi Values Analysis
+; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Memory Dependence Analysis
 ; CHECK-NEXT: Lazy Branch Probability Analysis