Index: lib/Transforms/Utils/MemorySSA.cpp =================================================================== --- lib/Transforms/Utils/MemorySSA.cpp +++ lib/Transforms/Utils/MemorySSA.cpp @@ -851,14 +851,70 @@ return Result; } +enum class Reorderability { + Always, + IfNoAlias, + Never +}; + +/// This does one-way checks to see if Use could theoretically be hoisted above +/// MayClobber. This will not check the other way around. +/// +/// This assumes that, for the purposes of MemorySSA, Use comes directly after +/// MayClobber, with no potentially clobbering operations in between them. +/// (Where potentially clobbering ops are memory barriers, aliased stores, etc.) +static Reorderability getLoadReorderability(const LoadInst *Use, + const LoadInst *MayClobber) { + // Volatile operations may never be reordered with other volatile operations. + if (Use->isVolatile() && MayClobber->isVolatile()) + return Reorderability::Never; + + // The lang ref allows reordering of volatile and non-volatile operations. + // Whether an aliasing nonvolatile load and volatile load can be reordered, + // though, is ambiguous. Because it may not be best to exploit this ambiguity, + // we only allow volatile/non-volatile reordering if the volatile and + // non-volatile operations don't alias. + Reorderability Result = Reorderability::Always; + if (Use->isVolatile() || MayClobber->isVolatile()) + Result = Reorderability::IfNoAlias; + + // If a load is seq_cst, it cannot be moved above other loads. If its ordering + // is weaker, it can be moved above other loads. We just need to be sure that + // MayClobber isn't an acquire load, because loads can't be moved above + // acquire loads. + // + // Note that this explicitly *does* allow the free reordering of monotonic (or + // weaker) loads of the same address. 
+  bool UseIsSeqCst = Use->getOrdering() == SequentiallyConsistent;
+  bool MayClobberIsAcquire = MayClobber->getOrdering() >= Acquire;
+  if (UseIsSeqCst || MayClobberIsAcquire)
+    return Reorderability::Never;
+  return Result;
+}
+
 bool CachingMemorySSAWalker::instructionClobbersQuery(
     const MemoryDef *MD, UpwardsMemoryQuery &Q,
     const MemoryLocation &Loc) const {
   Instruction *DefMemoryInst = MD->getMemoryInst();
   assert(DefMemoryInst && "Defining instruction not actually an instruction");
 
-  if (!Q.IsCall)
+  if (!Q.IsCall) {
+    const Instruction *UseMemoryInst = Q.Inst;
+    auto *DefLoad = dyn_cast<LoadInst>(DefMemoryInst);
+    auto *UseLoad = dyn_cast<LoadInst>(UseMemoryInst);
+    // If load A cannot be reordered above load B, then load B clobbers load A.
+    if (DefLoad && UseLoad) {
+      switch (getLoadReorderability(UseLoad, DefLoad)) {
+      case Reorderability::Always:
+        return false;
+      case Reorderability::Never:
+        return true;
+      case Reorderability::IfNoAlias:
+        return !AA->isNoAlias(Loc, MemoryLocation::get(DefLoad));
+      }
+    }
     return AA->getModRefInfo(DefMemoryInst, Loc) & MRI_Mod;
+  }
 
   // If this is a call, mark it for caching
   if (ImmutableCallSite(DefMemoryInst))
@@ -1058,6 +1114,18 @@
   if (auto CacheResult = doCacheLookup(StartingAccess, Q, Q.StartingLoc))
     return CacheResult;
 
+  // If the memory can't be changed, then loads of the memory can't be
+  // clobbered.
+  //
+  // FIXME: We should handle invariant groups, as well. It's a bit harder,
+  // because we need to pay close attention to invariant group barriers.
+  if (isa<LoadInst>(I) && (I->getMetadata(LLVMContext::MD_invariant_load) ||
+                           AA->pointsToConstantMemory(I))) {
+    MemoryAccess *LiveOnEntry = MSSA->getLiveOnEntryDef();
+    doCacheInsert(StartingAccess, LiveOnEntry, Q, Q.StartingLoc);
+    return LiveOnEntry;
+  }
+
   // Start with the thing we already think clobbers this location
   MemoryAccess *DefiningAccess = StartingAccess->getDefiningAccess();
Index: test/Transforms/Util/MemorySSA/atomic-clobber.ll
===================================================================
--- test/Transforms/Util/MemorySSA/atomic-clobber.ll
+++ test/Transforms/Util/MemorySSA/atomic-clobber.ll
@@ -2,6 +2,7 @@
 ;
 ; Ensures that atomic loads count as MemoryDefs
 
+; CHECK-LABEL: define i32 @foo
 define i32 @foo(i32* %a, i32* %b) {
 ; CHECK: 1 = MemoryDef(liveOnEntry)
 ; CHECK-NEXT: store i32 4
@@ -15,3 +16,103 @@
   %3 = add i32 %1, %2
   ret i32 %3
 }
+
+; CHECK-LABEL: define void @bar
+define void @bar(i32* %a) {
+; CHECK: MemoryUse(liveOnEntry)
+; CHECK-NEXT: load atomic i32, i32* %a unordered, align 4
+  load atomic i32, i32* %a unordered, align 4
+; CHECK: 1 = MemoryDef(liveOnEntry)
+; CHECK-NEXT: load atomic i32, i32* %a monotonic, align 4
+  load atomic i32, i32* %a monotonic, align 4
+; CHECK: 2 = MemoryDef(1)
+; CHECK-NEXT: load atomic i32, i32* %a acquire, align 4
+  load atomic i32, i32* %a acquire, align 4
+; CHECK: 3 = MemoryDef(2)
+; CHECK-NEXT: load atomic i32, i32* %a seq_cst, align 4
+  load atomic i32, i32* %a seq_cst, align 4
+  ret void
+}
+
+; CHECK-LABEL: define void @baz
+define void @baz(i32* %a) {
+; CHECK: 1 = MemoryDef(liveOnEntry)
+; CHECK-NEXT: %1 = load atomic i32
+  %1 = load atomic i32, i32* %a acquire, align 4
+; CHECK: MemoryUse(1)
+; CHECK-NEXT: %2 = load atomic i32, i32* %a unordered, align 4
+  %2 = load atomic i32, i32* %a unordered, align 4
+; CHECK: 2 = MemoryDef(1)
+; CHECK-NEXT: %3 = load atomic i32, i32* %a monotonic, align 4
+  %3 = load atomic i32, i32* %a monotonic, align 4
+  ret void
+}
+
+; CHECK-LABEL: define void @fences
+define void @fences(i32* %a) { +; CHECK: 1 = MemoryDef(liveOnEntry) +; CHECK-NEXT: fence acquire + fence acquire +; CHECK: MemoryUse(1) +; CHECK-NEXT: %1 = load i32, i32* %a + %1 = load i32, i32* %a + +; CHECK: 2 = MemoryDef(1) +; CHECK-NEXT: fence release + fence release +; CHECK: MemoryUse(2) +; CHECK-NEXT: %2 = load i32, i32* %a + %2 = load i32, i32* %a + +; CHECK: 3 = MemoryDef(2) +; CHECK-NEXT: fence acq_rel + fence acq_rel +; CHECK: MemoryUse(3) +; CHECK-NEXT: %3 = load i32, i32* %a + %3 = load i32, i32* %a + +; CHECK: 4 = MemoryDef(3) +; CHECK-NEXT: fence seq_cst + fence seq_cst +; CHECK: MemoryUse(4) +; CHECK-NEXT: %4 = load i32, i32* %a + %4 = load i32, i32* %a + ret void +} + +; CHECK-LABEL: define void @seq_cst_clobber +define void @seq_cst_clobber(i32* noalias %a, i32* noalias %b) { +; CHECK: 1 = MemoryDef(liveOnEntry) +; CHECK-NEXT: %1 = load atomic i32, i32* %a monotonic, align 4 + load atomic i32, i32* %a monotonic, align 4 + +; CHECK: 2 = MemoryDef(1) +; CHECK-NEXT: %2 = load atomic i32, i32* %a seq_cst, align 4 + load atomic i32, i32* %a seq_cst, align 4 + +; CHECK: 3 = MemoryDef(2) +; CHECK-NEXT: load atomic i32, i32* %a monotonic, align 4 + load atomic i32, i32* %a monotonic, align 4 + + ret void +} + +; Ensure that AA hands us MRI_Mod on unreorderable atomic ops. +; +; This test is a bit implementation-specific. In particular, it depends on that +; we pass cmpxchg-load queries to AA, without trying to reason about them on +; our own. +; +; If AA gets more aggressive, we can find another way. 
+; +; CHECK-LABEL: define void @check_aa_is_sane +define void @check_aa_is_sane(i32* noalias %a, i32* noalias %b) { +; CHECK: 1 = MemoryDef(liveOnEntry) +; CHECK-NEXT: cmpxchg i32* %a, i32 0, i32 1 acquire acquire + cmpxchg i32* %a, i32 0, i32 1 acquire acquire +; CHECK: MemoryUse(1) +; CHECK-NEXT: load i32, i32* %b, align 4 + load i32, i32* %b, align 4 + + ret void +} Index: test/Transforms/Util/MemorySSA/constant-memory.ll =================================================================== --- /dev/null +++ test/Transforms/Util/MemorySSA/constant-memory.ll @@ -0,0 +1,41 @@ +; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s +; +; Things that BasicAA can prove points to constant memory should be +; liveOnEntry, as well. + +declare void @clobberAllTheThings() + +@str = private unnamed_addr constant [2 x i8] c"hi" + +define i8 @foo() { +; CHECK: 1 = MemoryDef(liveOnEntry) +; CHECK-NEXT: call void @clobberAllTheThings() + call void @clobberAllTheThings() + %1 = getelementptr [2 x i8], [2 x i8]* @str, i64 0, i64 0 +; CHECK: MemoryUse(liveOnEntry) +; CHECK-NEXT: %2 = load i8 + %2 = load i8, i8* %1, align 1 + %3 = getelementptr [2 x i8], [2 x i8]* @str, i64 0, i64 1 +; CHECK: MemoryUse(liveOnEntry) +; CHECK-NEXT: %4 = load i8 + %4 = load i8, i8* %3, align 1 + %5 = add i8 %2, %4 + ret i8 %5 +} + +define i8 @select(i1 %b) { + %1 = alloca i8, align 1 +; CHECK: 1 = MemoryDef(liveOnEntry) +; CHECK-NEXT: store i8 0 + store i8 0, i8* %1, align 1 + +; CHECK: 2 = MemoryDef(1) +; CHECK-NEXT: call void @clobberAllTheThings() + call void @clobberAllTheThings() + %2 = getelementptr [2 x i8], [2 x i8]* @str, i64 0, i64 0 + %3 = select i1 %b, i8* %2, i8* %1 +; CHECK: MemoryUse(2) +; CHECK-NEXT: %4 = load i8 + %4 = load i8, i8* %3, align 1 + ret i8 %4 +} Index: test/Transforms/Util/MemorySSA/invariant-groups.ll =================================================================== --- /dev/null +++ test/Transforms/Util/MemorySSA/invariant-groups.ll @@ 
-0,0 +1,30 @@ +; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s +; +; Currently, MemorySSA doesn't support invariant groups. So, we should ignore +; invariant.group.barrier intrinsics entirely. We'll need to pay attention to +; them when/if we decide to support invariant groups. + +@g = external global i32 + +define i32 @foo(i32* %a) { +; CHECK: 1 = MemoryDef(liveOnEntry) +; CHECK-NEXT: store i32 0 + store i32 0, i32* %a, align 4, !llvm.invariant.group !0 + +; CHECK: 2 = MemoryDef(1) +; CHECK-NEXT: store i32 1 + store i32 1, i32* @g, align 4 + + %1 = bitcast i32* %a to i8* + %a8 = call i8* @llvm.invariant.group.barrier(i8* %1) + %a32 = bitcast i8* %a8 to i32* + +; CHECK: MemoryUse(2) +; CHECK-NEXT: %2 = load i32 + %2 = load i32, i32* %a32, align 4, !llvm.invariant.group !0 + ret i32 %2 +} + +declare i8* @llvm.invariant.group.barrier(i8*) + +!0 = !{!"group1"} Index: test/Transforms/Util/MemorySSA/load-invariant.ll =================================================================== --- test/Transforms/Util/MemorySSA/load-invariant.ll +++ test/Transforms/Util/MemorySSA/load-invariant.ll @@ -1,11 +1,7 @@ -; XFAIL: * ; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s ; ; Invariant loads should be considered live on entry, because, once the ; location is known to be dereferenceable, the value can never change. -; -; Currently XFAILed because this optimization was held back from the initial -; commit. 
 @g = external global i32
 
@@ -21,4 +17,19 @@
   ret i32 %1
 }
 
+define i32 @bar(i32* %a) {
+; CHECK: 1 = MemoryDef(liveOnEntry)
+; CHECK-NEXT: call void @clobberAllTheThings()
+  call void @clobberAllTheThings()
+
+; CHECK: 2 = MemoryDef(1)
+; CHECK-NEXT: %1 = load atomic i32
+  %1 = load atomic i32, i32* %a acquire, align 4, !invariant.load !0
+
+; CHECK: MemoryUse(2)
+; CHECK-NEXT: %2 = load i32
+  %2 = load i32, i32* %a, align 4
+  ret i32 %2
+}
+
 !0 = !{}
Index: test/Transforms/Util/MemorySSA/volatile-clobber.ll
===================================================================
--- test/Transforms/Util/MemorySSA/volatile-clobber.ll
+++ test/Transforms/Util/MemorySSA/volatile-clobber.ll
@@ -19,3 +19,74 @@
   %4 = add i32 %3, %2
   ret i32 %4
 }
+
+; Ensuring that we don't automatically hoist nonvolatile loads around volatile
+; loads
+; CHECK-LABEL: define void @volatile_only
+define void @volatile_only(i32* %arg1, i32* %arg2) {
+  ; Trivially NoAlias/MustAlias
+  %a = alloca i32
+  %b = alloca i32
+
+; CHECK: 1 = MemoryDef(liveOnEntry)
+; CHECK-NEXT: load volatile i32, i32* %a
+  load volatile i32, i32* %a
+; CHECK: MemoryUse(liveOnEntry)
+; CHECK-NEXT: load i32, i32* %b
+  load i32, i32* %b
+; CHECK: MemoryUse(1)
+; CHECK-NEXT: load i32, i32* %a
+  load i32, i32* %a
+
+  ; MayAlias
+; CHECK: 2 = MemoryDef(1)
+; CHECK-NEXT: load volatile i32, i32* %arg1
+  load volatile i32, i32* %arg1
+; CHECK: MemoryUse(2)
+; CHECK-NEXT: load i32, i32* %arg2
+  load i32, i32* %arg2
+
+  ret void
+}
+
+; Ensuring that volatile atomic operations work properly.
+; CHECK-LABEL: define void @volatile_atomics
+define void @volatile_atomics(i32* %arg1, i32* %arg2) {
+  %a = alloca i32
+  %b = alloca i32
+
+  ; Trivially NoAlias/MustAlias
+
+; CHECK: 1 = MemoryDef(liveOnEntry)
+; CHECK-NEXT: load atomic volatile i32, i32* %a acquire, align 4
+  load atomic volatile i32, i32* %a acquire, align 4
+; CHECK: MemoryUse(1)
+; CHECK-NEXT: load i32, i32* %b
+  load i32, i32* %b
+
+; CHECK: 2 = MemoryDef(1)
+; CHECK-NEXT: load atomic volatile i32, i32* %a monotonic, align 4
+  load atomic volatile i32, i32* %a monotonic, align 4
+; CHECK: MemoryUse(1)
+; CHECK-NEXT: load i32, i32* %b
+  load i32, i32* %b
+; CHECK: MemoryUse(1)
+; CHECK-NEXT: load atomic i32, i32* %b unordered, align 4
+  load atomic i32, i32* %b unordered, align 4
+; CHECK: MemoryUse(2)
+; CHECK-NEXT: load atomic i32, i32* %a unordered, align 4
+  load atomic i32, i32* %a unordered, align 4
+; CHECK: MemoryUse(2)
+; CHECK-NEXT: load i32, i32* %a
+  load i32, i32* %a
+
+  ; MayAlias
+; CHECK: 3 = MemoryDef(2)
+; CHECK-NEXT: load atomic volatile i32, i32* %arg1 monotonic, align 4
+  load atomic volatile i32, i32* %arg1 monotonic, align 4
+; CHECK: MemoryUse(3)
+; CHECK-NEXT: load i32, i32* %arg2
+  load i32, i32* %arg2
+
+  ret void
+}