Index: include/llvm/Transforms/Utils/MemorySSA.h
===================================================================
--- include/llvm/Transforms/Utils/MemorySSA.h
+++ include/llvm/Transforms/Utils/MemorySSA.h
@@ -752,7 +752,7 @@
 private:
   MemoryAccessPair UpwardsDFSWalk(MemoryAccess *, const MemoryLocation &,
-                                  UpwardsMemoryQuery &, bool);
+                                  UpwardsMemoryQuery &, bool, bool);
   MemoryAccess *getClobberingMemoryAccess(MemoryAccess *, UpwardsMemoryQuery &);
   bool instructionClobbersQuery(const MemoryDef *, UpwardsMemoryQuery &,
                                 const MemoryLocation &Loc) const;
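The second bool threaded through UpwardsDFSWalk here is SeenBackedge. The existing flag, FollowingBackedge, only says whether the step currently being taken follows a back-edge; SeenBackedge stays set for the remainder of the walk once any back-edge has been taken. A minimal stand-alone model of how the two flags propagate (plain C++ with placeholder types, not this patch's code; Visited-set handling is elided, so this model would not terminate on a cyclic graph):

    #include <vector>

    struct Node;
    struct Edge {
      Node *Pred;      // an upward def / phi argument
      bool IsBackedge; // stand-in for the DT->dominates() test in the .cpp below
    };
    struct Node { std::vector<Edge> Preds; };

    // FollowingBackedge describes only the edge just taken; SeenBackedge is
    // sticky for the rest of the walk, mirroring `Backedge || SeenBackedge`.
    void walk(Node *N, bool FollowingBackedge, bool SeenBackedge) {
      for (const Edge &E : N->Preds) {
        bool Backedge = !FollowingBackedge && E.IsBackedge;
        walk(E.Pred, Backedge, Backedge || SeenBackedge);
      }
    }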
Index: lib/Transforms/Utils/MemorySSA.cpp
===================================================================
--- lib/Transforms/Utils/MemorySSA.cpp
+++ lib/Transforms/Utils/MemorySSA.cpp
@@ -872,11 +872,14 @@
 MemoryAccessPair CachingMemorySSAWalker::UpwardsDFSWalk(
     MemoryAccess *StartingAccess, const MemoryLocation &Loc,
-    UpwardsMemoryQuery &Q, bool FollowingBackedge) {
+    UpwardsMemoryQuery &Q, bool FollowingBackedge, bool SeenBackedge) {
   MemoryAccess *ModifyingAccess = nullptr;
 
   auto DFI = df_begin(StartingAccess);
-  for (auto DFE = df_end(StartingAccess); DFI != DFE;) {
+  // FIXME: Refactor this loop
+  for (auto DFE = df_end(StartingAccess); DFI != DFE; ++DFI) {
+    assert(!ModifyingAccess &&
+           "ModifyingAccess was set, but we're still in the loop?");
     MemoryAccess *CurrAccess = *DFI;
     if (MSSA->isLiveOnEntryDef(CurrAccess))
       return {CurrAccess, Loc};
@@ -894,81 +897,107 @@
       }
     }
 
-    // We need to know whether it is a phi so we can track backedges.
-    // Otherwise, walk all upward defs.
-    if (!isa<MemoryPhi>(CurrAccess)) {
-      ++DFI;
+    // If this isn't a phi, we can continue walking all upward defs.
+    if (!isa<MemoryPhi>(CurrAccess))
       continue;
-    }
 
     // Recurse on PHI nodes, since we need to change locations.
     // TODO: Allow graphtraits on pairs, which would turn this whole function
     // into a normal single depth first walk.
     MemoryAccess *FirstDef = nullptr;
-    DFI = DFI.skipChildren();
     const MemoryAccessPair PHIPair(CurrAccess, Loc);
-    bool VisitedOnlyOne = true;
-    for (auto MPI = upward_defs_begin(PHIPair), MPE = upward_defs_end();
-         MPI != MPE; ++MPI) {
-      // Don't follow this path again if we've followed it once
-      if (!Q.Visited.insert(*MPI).second)
-        continue;
-
-      bool Backedge =
-          !FollowingBackedge &&
-          DT->dominates(CurrAccess->getBlock(), MPI.getPhiArgBlock());
-
-      MemoryAccessPair CurrentPair =
-          UpwardsDFSWalk(MPI->first, MPI->second, Q, Backedge);
-      // All the phi arguments should reach the same point if we can bypass
-      // this phi. The alternative is that they hit this phi node, which
-      // means we can skip this argument.
-      if (FirstDef && CurrentPair.first != PHIPair.first &&
-          CurrentPair.first != FirstDef) {
-        ModifyingAccess = CurrAccess;
-        break;
+    // If all phis end up pointing to the same place, we can skip this phi.
+    // This loop tries to detect these cases.
+    auto MPI = upward_defs_begin(PHIPair), MPE = upward_defs_end();
+    for (; MPI != MPE; ++MPI) {
+      MemoryAccess *Clobber;
+      // There's no point in putting liveOnEntry into `Visited`, nor is there
+      // a point in trying to walk it.
+      if (MSSA->isLiveOnEntryDef(MPI->first)) {
+        Clobber = MPI->first;
+      } else {
+        // If we were able to find this in the cache, we can get away without
+        // putting this in the `Visited` map, because we'll never recurse on
+        // it.
+        Clobber = doCacheLookup(MPI->first, Q, MPI->second);
+        if (!Clobber) {
+          bool SeenBefore = !Q.Visited.insert(*MPI).second;
+          // If the cache gives us nothing, we should walk. We need to be
+          // careful if we've seen this before *and* we've seen a back-edge,
+          // because we may have a phi that cycles back to itself. For this
+          // reason, we need to answer conservatively.
+          if (SeenBefore && SeenBackedge)
+            break;
+
+          bool Backedge =
+              !FollowingBackedge &&
+              DT->dominates(CurrAccess->getBlock(), MPI.getPhiArgBlock());
+
+          Clobber = UpwardsDFSWalk(MPI->first, MPI->second, Q, Backedge,
+                                   Backedge || SeenBackedge)
+                        .first;
+        }
       }
+      // If we looped back to this phi, no interesting defs happen on any of
+      // the paths we took, so we can safely ignore it.
+      if (Clobber == CurrAccess)
+        continue;
+
       if (!FirstDef)
-        FirstDef = CurrentPair.first;
-      else
-        VisitedOnlyOne = false;
+        FirstDef = Clobber;
+      else if (FirstDef != Clobber)
+        break;
     }
 
-    // The above loop determines if all arguments of the phi node reach the
-    // same place. However we skip arguments that are cyclically dependent
-    // only on the value of this phi node. This means in some cases, we may
-    // only visit one argument of the phi node, and the above loop will
-    // happily say that all the arguments are the same. However, in that case,
-    // we still can't walk past the phi node, because that argument still
-    // kills the access unless we hit the top of the function when walking
-    // that argument.
-    if (VisitedOnlyOne && FirstDef && !MSSA->isLiveOnEntryDef(FirstDef))
+#ifndef NDEBUG
+    // The above loop should examine all of the phi's children, so we can skip
+    // them. Because all nodes except Phis only have one edge, skipping the
+    // children of a phi should terminate our DFS.
+    //
+    // Don't call skipChildren directly on the iterator because doing so
+    // deletes the iterator's search stack, and we need that so we can cache
+    // everything.
+    auto DFICopy = DFI;
+    assert(DFICopy.skipChildren() == DFE);
+#endif
+
+    if (MPI != MPE) {
+      // If we didn't examine everything, we need to answer conservatively.
       ModifyingAccess = CurrAccess;
+    } else {
+      // FirstDef can only be null if all "Clobber"s pointed back to this phi,
+      // which would imply we have a loop with no entry. While this isn't a
+      // valid construct, we still need to handle it, because users can ask
+      // the walker to walk arbitrary IR.
+      ModifyingAccess = FirstDef ? FirstDef : MSSA->getLiveOnEntryDef();
+    }
+    break;
   }
 
-  if (!ModifyingAccess)
-    return {MSSA->getLiveOnEntryDef(), Q.StartingLoc};
+  if (!ModifyingAccess) {
+    // If DFI hit the end, it will have 0 path entries. Give up early.
+    assert(DFI == df_end(StartingAccess) &&
+           "Loop should set ModifyingAccess before exiting early.");
+    return {MSSA->getLiveOnEntryDef(), Loc};
+  }
 
-  const BasicBlock *OriginalBlock = Q.OriginalAccess->getBlock();
+  const BasicBlock *OriginalBlock = StartingAccess->getBlock();
   unsigned N = DFI.getPathLength();
-  MemoryAccess *FinalAccess = ModifyingAccess;
   for (; N != 0; --N) {
-    ModifyingAccess = DFI.getPath(N - 1);
-    BasicBlock *CurrBlock = ModifyingAccess->getBlock();
+    MemoryAccess *PathAccess = DFI.getPath(N - 1);
+    BasicBlock *CurrBlock = PathAccess->getBlock();
     if (!FollowingBackedge)
-      doCacheInsert(ModifyingAccess, FinalAccess, Q, Loc);
+      doCacheInsert(PathAccess, ModifyingAccess, Q, Loc);
     if (DT->dominates(CurrBlock, OriginalBlock) &&
         (CurrBlock != OriginalBlock || !FollowingBackedge ||
-         MSSA->locallyDominates(ModifyingAccess, Q.OriginalAccess)))
+         MSSA->locallyDominates(PathAccess, StartingAccess)))
       break;
   }
 
   // Cache everything else on the way back. The caller should cache
   // Q.OriginalAccess for us.
   for (; N != 0; --N) {
-    MemoryAccess *CacheAccess = DFI.getPath(N - 1);
-    doCacheInsert(CacheAccess, ModifyingAccess, Q, Loc);
+    MemoryAccess *PathAccess = DFI.getPath(N - 1);
+    doCacheInsert(PathAccess, ModifyingAccess, Q, Loc);
   }
   assert(Q.Visited.size() < 1000 && "Visited too much");
@@ -982,7 +1011,7 @@
 MemoryAccess *
 CachingMemorySSAWalker::getClobberingMemoryAccess(MemoryAccess *StartingAccess,
                                                   UpwardsMemoryQuery &Q) {
-  return UpwardsDFSWalk(StartingAccess, Q.StartingLoc, Q, false).first;
+  return UpwardsDFSWalk(StartingAccess, Q.StartingLoc, Q, false, false).first;
 }
 
 MemoryAccess *
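The heart of the change is the phi handling above: a phi is transparent to the walk only if every incoming def either reaches one common clobber or cycles back to the phi itself. A condensed sketch of just that convergence rule, where findClobber and liveOnEntry stand in for the doCacheLookup/UpwardsDFSWalk pair and MSSA->getLiveOnEntryDef(); this is a model of the logic, not the committed function:

    // Returns what the phi's arguments converge to, or the phi itself when
    // they disagree (the conservative answer).
    MemoryAccess *convergedClobber(MemoryPhi *Phi) {
      MemoryAccess *FirstDef = nullptr;
      for (Use &Arg : Phi->incoming_values()) {
        MemoryAccess *Clobber = findClobber(cast<MemoryAccess>(Arg.get()));
        if (Clobber == Phi)
          continue; // This path cycled back; it contributes no clobber.
        if (!FirstDef)
          FirstDef = Clobber;
        else if (FirstDef != Clobber)
          return Phi; // Paths diverge, so the phi itself is the clobber.
      }
      // Either all paths agreed on FirstDef, or every one of them cycled
      // back (a loop with no entry, which the walker must still tolerate).
      return FirstDef ? FirstDef : liveOnEntry();
    }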
Index: test/Transforms/Util/MemorySSA/cyclicphi.ll
===================================================================
--- test/Transforms/Util/MemorySSA/cyclicphi.ll
+++ test/Transforms/Util/MemorySSA/cyclicphi.ll
@@ -31,3 +31,95 @@
   %tmp79 = getelementptr inbounds i64, i64* %tmp78, i64 undef
   br label %bb26
 }
+
+; CHECK-LABEL: define void @quux_skip
+define void @quux_skip(%struct.hoge* noalias %f, i64* noalias %g) align 2 {
+  %tmp = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1, i32 0
+  %tmp24 = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1
+  %tmp25 = bitcast %struct.widget* %tmp24 to i64**
+  br label %bb26
+
+bb26:                                             ; preds = %bb77, %0
+; CHECK: 3 = MemoryPhi({%0,liveOnEntry},{bb77,2})
+; CHECK-NEXT: br i1 undef, label %bb68, label %bb77
+  br i1 undef, label %bb68, label %bb77
+
+bb68:                                             ; preds = %bb26
+; CHECK: MemoryUse(3)
+; CHECK-NEXT: %tmp69 = load i64, i64* %g, align 8
+  %tmp69 = load i64, i64* %g, align 8
+; CHECK: 1 = MemoryDef(3)
+; CHECK-NEXT: store i64 %tmp69, i64* %g, align 8
+  store i64 %tmp69, i64* %g, align 8
+  br label %bb77
+
+bb77:                                             ; preds = %bb68, %bb26
+; CHECK: 2 = MemoryPhi({bb26,3},{bb68,1})
+; NOTE: we're currently a bit too conservative here; this could point to
+; liveOnEntry.
+; CHECK: MemoryUse(3)
+; CHECK-NEXT: %tmp78 = load i64*, i64** %tmp25, align 8
+  %tmp78 = load i64*, i64** %tmp25, align 8
+  %tmp79 = getelementptr inbounds i64, i64* %tmp78, i64 undef
+  br label %bb26
+}
+
+; CHECK-LABEL: define void @quux_dominated
+define void @quux_dominated(%struct.hoge* noalias %f, i64* noalias %g) align 2 {
+  %tmp = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1, i32 0
+  %tmp24 = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1
+  %tmp25 = bitcast %struct.widget* %tmp24 to i64**
+  br label %bb26
+
+bb26:                                             ; preds = %bb77, %0
+; CHECK: 4 = MemoryPhi({%0,liveOnEntry},{bb77,2})
+; CHECK: MemoryUse(4)
+; CHECK-NEXT: load i64*, i64** %tmp25, align 8
+  load i64*, i64** %tmp25, align 8
+  br i1 undef, label %bb68, label %bb77
+
+bb68:                                             ; preds = %bb26
+; CHECK: MemoryUse(4)
+; CHECK-NEXT: %tmp69 = load i64, i64* %g, align 8
+  %tmp69 = load i64, i64* %g, align 8
+; CHECK: 1 = MemoryDef(4)
+; CHECK-NEXT: store i64 %tmp69, i64* %g, align 8
+  store i64 %tmp69, i64* %g, align 8
+  br label %bb77
+
+bb77:                                             ; preds = %bb68, %bb26
+; CHECK: 3 = MemoryPhi({bb26,4},{bb68,1})
+; CHECK: 2 = MemoryDef(3)
+; CHECK-NEXT: store i64* null, i64** %tmp25, align 8
+  store i64* null, i64** %tmp25, align 8
+  br label %bb26
+}
+
+; CHECK-LABEL: define void @quux_nodominate
+define void @quux_nodominate(%struct.hoge* noalias %f, i64* noalias %g) align 2 {
+  %tmp = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1, i32 0
+  %tmp24 = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1
+  %tmp25 = bitcast %struct.widget* %tmp24 to i64**
+  br label %bb26
+
+bb26:                                             ; preds = %bb77, %0
+; CHECK: 3 = MemoryPhi({%0,liveOnEntry},{bb77,2})
+; CHECK: MemoryUse(liveOnEntry)
+; CHECK-NEXT: load i64*, i64** %tmp25, align 8
+  load i64*, i64** %tmp25, align 8
+  br i1 undef, label %bb68, label %bb77
+
+bb68:                                             ; preds = %bb26
+; CHECK: MemoryUse(3)
+; CHECK-NEXT: %tmp69 = load i64, i64* %g, align 8
+  %tmp69 = load i64, i64* %g, align 8
+; CHECK: 1 = MemoryDef(3)
+; CHECK-NEXT: store i64 %tmp69, i64* %g, align 8
+  store i64 %tmp69, i64* %g, align 8
+  br label %bb77
+
+bb77:                                             ; preds = %bb68, %bb26
+; CHECK: 2 = MemoryPhi({bb26,3},{bb68,1})
+; CHECK-NEXT: br label %bb26
+  br label %bb26
+}
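The three quux tests differ in where the load through %tmp25 sits and whether anything in the loop clobbers it: in quux_dominated the store of null through %tmp25 really does clobber the load, so MemoryUse(4) is the precise answer; in quux_nodominate the walker can look all the way through the cycle to liveOnEntry; in quux_skip nothing clobbers %tmp25 either (the arguments are noalias), but the load sits in bb77, where the walk trips the new SeenBefore/SeenBackedge bail-out, and the NOTE records that the resulting MemoryUse(3) is conservative. The expected output can be regenerated with the same invocation the new test file below uses, assuming cyclicphi.ll carries the matching RUN line outside this hunk:

    opt -basicaa -print-memoryssa -verify-memoryssa -analyze \
        < test/Transforms/Util/MemorySSA/cyclicphi.ll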
Index: test/Transforms/Util/MemorySSA/phi-translation.ll
===================================================================
--- /dev/null
+++ test/Transforms/Util/MemorySSA/phi-translation.ll
@@ -0,0 +1,87 @@
+; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s
+
+; %ptr can't alias %local, so we should be able to optimize the use of %local
+; to point to the store to %local.
+; CHECK-LABEL: define void @check
+define void @check(i8* %ptr, i1 %bool) {
+entry:
+  %local = alloca i8, align 1
+; CHECK: 1 = MemoryDef(liveOnEntry)
+; CHECK-NEXT: store i8 0, i8* %local, align 1
+  store i8 0, i8* %local, align 1
+  br i1 %bool, label %if.then, label %if.end
+
+if.then:
+  %p2 = getelementptr inbounds i8, i8* %ptr, i32 1
+; CHECK: 2 = MemoryDef(1)
+; CHECK-NEXT: store i8 0, i8* %p2, align 1
+  store i8 0, i8* %p2, align 1
+  br label %if.end
+
+if.end:
+; CHECK: 3 = MemoryPhi({entry,1},{if.then,2})
+; CHECK: MemoryUse(1)
+; CHECK-NEXT: load i8, i8* %local, align 1
+  load i8, i8* %local, align 1
+  ret void
+}
+
+; This case is a bit more interesting; it exists specifically to test the
+; caching walker's walking algorithm.
+;
+; The phi structure is:
+;
+;   2 -> 3
+;   ^    ^
+;    \  /
+;     1
+;
+; where each number is a phi, each arrow is a directed edge from that phi to
+; another phi, and each phi's number is the order in which the walker visits
+; it.
+;
+; We want to optimize a use that uses phi 1 to use phi 3 instead. This case is
+; interesting because it hits the path where we've already visited the phi and
+; no phi translation has occurred, so we need to fall back to our cache.
+;
+; CHECK-LABEL: define void @check2
+define void @check2(i1 %val1, i1 %val2, i1 %val3) {
+entry:
+  %local = alloca i8, align 1
+  %local2 = alloca i8, align 1
+
+; CHECK: 1 = MemoryDef(liveOnEntry)
+; CHECK-NEXT: store i8 0, i8* %local
+  store i8 0, i8* %local
+  br i1 %val1, label %if.then, label %phi.3
+
+; We need if.then so phi.3 starts with an actual phi.
+if.then:
+; CHECK: 2 = MemoryDef(1)
+; CHECK-NEXT: store i8 2, i8* %local2
+  store i8 2, i8* %local2
+  br i1 %val2, label %phi.2, label %phi.3
+
+phi.3:
+; CHECK: 6 = MemoryPhi({entry,1},{if.then,2})
+; CHECK: 3 = MemoryDef(6)
+; CHECK-NEXT: store i8 3, i8* %local2
+  store i8 3, i8* %local2
+  br i1 %val3, label %phi.2, label %phi.1
+
+phi.2:
+; CHECK: 5 = MemoryPhi({if.then,2},{phi.3,3})
+; CHECK: 4 = MemoryDef(5)
+; CHECK-NEXT: store i8 4, i8* %local2
+  store i8 4, i8* %local2
+  br label %phi.1
+
+phi.1:
+; Order matters here: phi.2 needs to come before phi.3, because that's the
+; order they're visited in.
+; CHECK: 7 = MemoryPhi({phi.2,4},{phi.3,3})
+; CHECK: MemoryUse(1)
+; CHECK-NEXT: load i8, i8* %local
+  load i8, i8* %local
+  ret void
+}
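Roughly, the walk for the final load in check2 starts at phi 7, recurses into phi 5 and from there into phi 6, caching each result on the way back out; when phi 7's second argument leads to phi 6 again, that pair is already in Q.Visited, and it is the cache-first lookup added in this patch that still answers precisely, letting the use optimize to MemoryUse(1) rather than stopping at the phi. The relevant ordering, schematically (arguments elided; this excerpts the loop added above):

    // Consult the cache before Q.Visited; only mark and walk on a cache miss.
    Clobber = doCacheLookup(MPI->first, Q, MPI->second);
    if (!Clobber) {
      bool SeenBefore = !Q.Visited.insert(*MPI).second;
      if (SeenBefore && SeenBackedge)
        break; // Possible self-cycle: answer conservatively.
      Clobber = UpwardsDFSWalk(/* ...walk and fill the cache... */).first;
    }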