Index: llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp
+++ llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -356,6 +356,9 @@
     unsigned Generation = 0;
     int MatchingId = -1;
     bool IsAtomic = false;
+
+    // TODO: Remove this flag.  It would be strictly stronger to add a record
+    // to the AvailableInvariants table when passing the invariant load instead.
     bool IsInvariant = false;

     LoadValue() = default;
@@ -373,6 +376,17 @@
                       LoadMapAllocator>;
   LoadHTType AvailableLoads;

+  // A scoped hash table mapping memory locations (represented as typed
+  // addresses) to generation numbers at which that memory location became
+  // (henceforth indefinitely) invariant.
+  using InvariantMapAllocator =
+      RecyclingAllocator<BumpPtrAllocator,
+                         ScopedHashTableVal<MemoryLocation, unsigned>>;
+  using InvariantHTType =
+      ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+                      InvariantMapAllocator>;
+  InvariantHTType AvailableInvariants;
+
   /// \brief A scoped hash table of the current values of read-only call
   /// values.
@@ -401,15 +415,16 @@
   class NodeScope {
   public:
     NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
-              CallHTType &AvailableCalls)
-        : Scope(AvailableValues), LoadScope(AvailableLoads),
-          CallScope(AvailableCalls) {}
+              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+        : Scope(AvailableValues), LoadScope(AvailableLoads),
+          InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
     NodeScope(const NodeScope &) = delete;
     NodeScope &operator=(const NodeScope &) = delete;

   private:
     ScopedHTType::ScopeTy Scope;
     LoadHTType::ScopeTy LoadScope;
+    InvariantHTType::ScopeTy InvariantScope;
     CallHTType::ScopeTy CallScope;
   };
@@ -420,10 +435,13 @@
   class StackNode {
   public:
     StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
-              CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
-              DomTreeNode::iterator child, DomTreeNode::iterator end)
+              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+              unsigned cg, DomTreeNode *n, DomTreeNode::iterator child,
+              DomTreeNode::iterator end)
         : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
-          EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+          EndIter(end),
+          Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+                 AvailableCalls)
       {}
     StackNode(const StackNode &) = delete;
     StackNode &operator=(const StackNode &) = delete;
@@ -563,6 +581,10 @@
                                                  ExpectedType);
   }

+  /// Return true if the instruction is known to only operate on memory
+  /// provably invariant in the given "generation".
+  bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
+
   bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
                            Instruction *EarlierInst, Instruction *LaterInst);
@@ -656,6 +678,27 @@
   return MSSA->dominates(LaterDef, EarlierMA);
 }

+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+  // A location loaded from with an invariant_load is assumed to *never* change
+  // within the visible scope of the compilation.
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    if (LI->getMetadata(LLVMContext::MD_invariant_load))
+      return true;
+
+  auto MemLocOpt = MemoryLocation::getOrNone(I);
+  if (!MemLocOpt)
+    // "target" intrinsic forms of loads aren't currently known to
+    // MemoryLocation::get.  TODO
+    return false;
+  MemoryLocation MemLoc = *MemLocOpt;
+  if (!AvailableInvariants.count(MemLoc))
+    return false;
+
+  // Is the generation at which this became invariant older than the
+  // current one?
+  return AvailableInvariants.lookup(MemLoc) <= GenAt;
+}
+
 bool EarlyCSE::processNode(DomTreeNode *Node) {
   bool Changed = false;
   BasicBlock *BB = Node->getBlock();
@@ -741,18 +784,28 @@
       continue;
     }

-    // Skip invariant.start intrinsics since they only read memory, and we can
-    // forward values across it. Also, we dont need to consume the last store
-    // since the semantics of invariant.start allow us to perform DSE of the
-    // last store, if there was a store following invariant.start. Consider:
+    // We can skip all invariant.start intrinsics since they only read memory,
+    // and we can forward values across them. For invariant starts without
+    // invariant ends, we can use the fact that the invariantness never ends to
+    // start a scope in the current generation which remains true for all
+    // future generations. Also, we don't need to consume the last store since
+    // the semantics of invariant.start allow us to perform DSE of the last
+    // store, if there was a store following invariant.start. Consider:
    //
    //   store 30, i8* p
    //   invariant.start(p)
    //   store 40, i8* p
    // We can DSE the store to 30, since the store 40 to invariant location p
    // causes undefined behaviour.
-    if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+    if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+      // If there are any uses, the scope might end.
+      if (!Inst->use_empty())
+        continue;
+      auto *CI = cast<CallInst>(Inst);
+      MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI);
+      AvailableInvariants.insert(MemLoc, CurrentGeneration);
       continue;
+    }

    if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
      if (auto *CondI =
@@ -850,7 +903,8 @@
          !MemInst.isVolatile() && MemInst.isUnordered() &&
          // We can't replace an atomic load with one which isn't also atomic.
          InVal.IsAtomic >= MemInst.isAtomic() &&
-          (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+          (InVal.IsInvariant ||
+           isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
           isSameMemGeneration(InVal.Generation, CurrentGeneration,
                               InVal.DefInst, Inst))) {
        Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
@@ -934,8 +988,9 @@
          InVal.MatchingId == MemInst.getMatchingId() &&
          // We don't yet handle removing stores with ordering of any kind.
          !MemInst.isVolatile() && MemInst.isUnordered() &&
-          isSameMemGeneration(InVal.Generation, CurrentGeneration,
-                              InVal.DefInst, Inst)) {
+          (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+           isSameMemGeneration(InVal.Generation, CurrentGeneration,
+                               InVal.DefInst, Inst))) {
        // It is okay to have a LastStore to a different pointer here if MemorySSA
        // tells us that the load and store are from the same memory generation.
        // In that case, LastStore should keep its present value since we're
@@ -1027,8 +1082,9 @@

  // Process the root node.
  nodesToProcess.push_back(new StackNode(
-      AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
-      DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+      AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+      CurrentGeneration, DT.getRootNode(),
+      DT.getRootNode()->begin(), DT.getRootNode()->end()));

  // Save the current generation.
  unsigned LiveOutGeneration = CurrentGeneration;
@@ -1052,9 +1108,9 @@
      // Push the next child onto the stack.
      DomTreeNode *child = NodeToProcess->nextChild();
      nodesToProcess.push_back(
-          new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
-                        NodeToProcess->childGeneration(), child, child->begin(),
-                        child->end()));
+          new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+                        AvailableCalls, NodeToProcess->childGeneration(),
+                        child, child->begin(), child->end()));
    } else {
      // It has been processed, and there are no more children to process,
      // so delete it and pop it off the stack.
Index: llvm/trunk/test/Transforms/EarlyCSE/invariant-loads.ll
===================================================================
--- llvm/trunk/test/Transforms/EarlyCSE/invariant-loads.ll
+++ llvm/trunk/test/Transforms/EarlyCSE/invariant-loads.ll
@@ -94,3 +94,12 @@
   call void @clobber_and_use(i32 %val1)
   ret void
 }
+
+define void @test_false_negative_dse(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_false_negative_dse
+; CHECK: store
+  %v1 = load i32, i32* %p, !invariant.load !{}
+  call void @clobber_and_use(i32 %v1)
+  store i32 %v1, i32* %p
+  ret void
+}
Index: llvm/trunk/test/Transforms/EarlyCSE/invariant.start.ll
===================================================================
--- llvm/trunk/test/Transforms/EarlyCSE/invariant.start.ll
+++ llvm/trunk/test/Transforms/EarlyCSE/invariant.start.ll
@@ -6,13 +6,12 @@

 ; Check that we do load-load forwarding over invariant.start, since it does not
 ; clobber memory
-define i8 @test1(i8 *%P) {
-  ; CHECK-LABEL: @test1(
+define i8 @test_bypass1(i8 *%P) {
+  ; CHECK-LABEL: @test_bypass1(
   ; CHECK-NEXT: %V1 = load i8, i8* %P
   ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   ; CHECK-NEXT: ret i8 0
-
   %V1 = load i8, i8* %P
   %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   %V2 = load i8, i8* %P
@@ -22,13 +21,12 @@

 ; Trivial Store->load forwarding over invariant.start
-define i8 @test2(i8 *%P) {
-  ; CHECK-LABEL: @test2(
+define i8 @test_bypass2(i8 *%P) {
+  ; CHECK-LABEL: @test_bypass2(
   ; CHECK-NEXT: store i8 42, i8* %P
   ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   ; CHECK-NEXT: ret i8 42
-
   store i8 42, i8* %P
   %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   %V1 = load i8, i8* %P
@@ -38,13 +36,11 @@
 ; We can DSE over invariant.start calls, since the first store to
 ; %P is valid, and the second store is actually unreachable based on semantics
 ; of invariant.start.
-define void @test3(i8* %P) {
-
-; CHECK-LABEL: @test3(
+define void @test_bypass3(i8* %P) {
+; CHECK-LABEL: @test_bypass3(
 ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
 ; CHECK-NEXT: store i8 60, i8* %P
-
   store i8 50, i8* %P
   %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   store i8 60, i8* %P
@@ -54,9 +50,9 @@

 ; FIXME: Now the first store can actually be eliminated, since there is no read within
 ; the invariant region, between start and end.
-define void @test4(i8* %P) {
+define void @test_bypass4(i8* %P) {

-; CHECK-LABEL: @test4(
+; CHECK-LABEL: @test_bypass4(
 ; CHECK-NEXT: store i8 50, i8* %P
 ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
 ; CHECK-NEXT: call void @llvm.invariant.end.p0i8({}* %i, i64 1, i8* %P)
@@ -69,3 +65,214 @@
   store i8 60, i8* %P
   ret void
 }
+
+
+declare void @clobber()
+declare {}* @llvm.invariant.start.p0i32(i64 %size, i32* nocapture %ptr)
+declare void @llvm.invariant.end.p0i32({}*, i64, i32* nocapture) nounwind
+
+define i32 @test_before_load(i32* %p) {
+; CHECK-LABEL: @test_before_load
+; CHECK: ret i32 0
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_before_clobber(i32* %p) {
+; CHECK-LABEL: @test_before_clobber
+; CHECK: ret i32 0
+  %v1 = load i32, i32* %p
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_unanalzyable_load(i32* %p) {
+; CHECK-LABEL: @test_unanalzyable_load
+; CHECK: ret i32 0
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_after_clobber(i32* %p) {
+; CHECK-LABEL: @test_negative_after_clobber
+; CHECK: ret i32 %sub
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge
+; CHECK: ret i32 0
+  %v1 = load i32, i32* %p
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call void @clobber()
+  br label %merge
+merge:
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_after_mergeclobber(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_negative_after_mergeclobber
+; CHECK: ret i32 %sub
+  %v1 = load i32, i32* %p
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call void @clobber()
+  br label %merge
+merge:
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+; In theory, this version could work, but earlycse is incapable of
+; merging facts along distinct paths.
+define i32 @test_false_negative_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_false_negative_merge
+; CHECK: ret i32 %sub
+  %v1 = load i32, i32* %p
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  br label %merge
+merge:
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_merge_unanalyzable_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge_unanalyzable_load
+; CHECK: ret i32 0
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  %v1 = load i32, i32* %p
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call void @clobber()
+  br label %merge
+merge:
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define void @test_dse_before_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_before_load
+; CHECK-NOT: store
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  store i32 %v1, i32* %p
+  ret void
+}
+
+define void @test_dse_after_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_after_load
+; CHECK-NOT: store
+  %v1 = load i32, i32* %p
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  store i32 %v1, i32* %p
+  ret void
+}
+
+
+; In this case, we have a false negative since MemoryLocation is implicitly
+; typed due to the use of a Value to represent the address.  Note that other
+; passes will canonicalize away the bitcasts in this example.
+define i32 @test_false_negative_types(i32* %p) {
+; CHECK-LABEL: @test_false_negative_types
+; CHECK: ret i32 %sub
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %pf = bitcast i32* %p to float*
+  %v2f = load float, float* %pf
+  %v2 = bitcast float %v2f to i32
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_size1(i32* %p) {
+; CHECK-LABEL: @test_negative_size1
+; CHECK: ret i32 %sub
+  call {}* @llvm.invariant.start.p0i32(i64 3, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_size2(i32* %p) {
+; CHECK-LABEL: @test_negative_size2
+; CHECK: ret i32 %sub
+  call {}* @llvm.invariant.start.p0i32(i64 0, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_negative_scope
+; CHECK: ret i32 %sub
+  %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_false_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_false_negative_scope
+; CHECK: ret i32 %sub
+  %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+; Invariant load de facto starts an invariant.start scope of the appropriate size
+define i32 @test_invariant_load_scope(i32* %p) {
+; CHECK-LABEL: @test_invariant_load_scope
+; CHECK: ret i32 0
+  %v1 = load i32, i32* %p, !invariant.load !{}
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
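Background note (not part of the patch): the heart of the change is the comparison `AvailableInvariants.lookup(MemLoc) <= GenAt` in isOperatingOnInvariantMemAt. If the location became invariant no later than the generation at which the available value was recorded, no intervening clobber can have modified it, so the earlier value may be forwarded (and the matching store DSE'd). The standalone C++ sketch below models only that comparison; the string-keyed std::map and the explicit clobber lambda are simplifications standing in for EarlyCSE's MemoryLocation-keyed scoped hash table and its generation bumps on may-write instructions.

// Standalone sketch (not LLVM code): models the generation check that
// isOperatingOnInvariantMemAt performs against AvailableInvariants.
#include <cstdio>
#include <map>
#include <string>

int main() {
  // Location -> generation at which it became (indefinitely) invariant.
  std::map<std::string, unsigned> AvailableInvariants;
  unsigned CurrentGeneration = 0;

  // A may-write instruction (e.g. an opaque call) bumps the generation.
  auto Clobber = [&] { ++CurrentGeneration; };

  // invariant.start(p): record the current generation for p's location.
  AvailableInvariants["p"] = CurrentGeneration;

  unsigned GenOfEarlierLoad = CurrentGeneration; // first load of p
  Clobber();                                     // call void @clobber()

  // Later load of p: the location became invariant no later than the
  // generation of the earlier load, so the clobber cannot have changed it
  // and the two loads can be CSE'd.
  auto It = AvailableInvariants.find("p");
  bool CanForward =
      It != AvailableInvariants.end() && It->second <= GenOfEarlierLoad;
  std::printf("forwardable across clobber: %s\n", CanForward ? "yes" : "no");
  return 0;
}

In the pass itself the table is a ScopedHashTable so recorded locations automatically go out of scope when the dominator-tree walk leaves the block containing the invariant.start; the flat map above ignores that scoping, which only matters for correctness across branches (see test_negative_after_mergeclobber).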