Index: include/llvm/Analysis/MemoryDependenceAnalysis.h
===================================================================
--- include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -302,6 +302,10 @@
     NonLocalPointerInfo() : Size(MemoryLocation::UnknownSize) {}
   };
 
+  /// Cache storing single nonlocal def for the instruction.
+  /// It is set when nonlocal def would be found in function returning only
+  /// local dependencies.
+  DenseMap<Instruction *, NonLocalDepResult> NonLocalDefsCache;
   /// This map stores the cached results of doing a pointer lookup at the
   /// bottom of a block.
   ///
@@ -441,9 +445,9 @@
   /// This analysis looks for other loads and stores with invariant.group
   /// metadata and the same pointer operand. Returns Unknown if it does not
   /// find anything, and Def if it can be assumed that 2 instructions load or
-  /// store the same value.
-  /// FIXME: This analysis works only on single block because of restrictions
-  /// at the call site.
+  /// store the same value and NonLocal which indicate that non-local Def was
+  /// found, which can be retrieved by calling getNonLocalPointerDependency
+  /// with the same queried instruction.
   MemDepResult getInvariantGroupPointerDependency(LoadInst *LI, BasicBlock *BB);
 
   /// Looks at a memory location for a load (specified by MemLocBase, Offs, and
Index: lib/Analysis/MemoryDependenceAnalysis.cpp
===================================================================
--- lib/Analysis/MemoryDependenceAnalysis.cpp
+++ lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -323,17 +323,27 @@
     const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt,
     BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) {
 
+  MemDepResult InvariantGroupDependency = MemDepResult::getUnknown();
   if (QueryInst != nullptr) {
     if (auto *LI = dyn_cast<LoadInst>(QueryInst)) {
-      MemDepResult invariantGroupDependency =
-          getInvariantGroupPointerDependency(LI, BB);
+      InvariantGroupDependency = getInvariantGroupPointerDependency(LI, BB);
 
-      if (invariantGroupDependency.isDef())
-        return invariantGroupDependency;
+      if (InvariantGroupDependency.isDef())
+        return InvariantGroupDependency;
     }
   }
-  return getSimplePointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst,
-                                        Limit);
+  MemDepResult SimpleDep = getSimplePointerDependencyFrom(
+      MemLoc, isLoad, ScanIt, BB, QueryInst, Limit);
+  if (SimpleDep.isDef())
+    return SimpleDep;
+  // Non-local invariant group dependency indicates there is non local Def,
+  // which is better than local clobber and everything else.
+  if (InvariantGroupDependency.isNonLocal())
+    return InvariantGroupDependency;
+
+  assert(InvariantGroupDependency.isUnknown() &&
+         "InvariantGroupDependency should be only unknown at this point");
+  return SimpleDep;
 }
 
 MemDepResult
@@ -358,6 +368,17 @@
   // Queue to process all pointers that are equivalent to load operand.
   SmallVector<const Value *, 8> LoadOperandsQueue;
   LoadOperandsQueue.push_back(LoadOperand);
+
+  Instruction *ClosestDependency = nullptr;
+  // Order of instructions in uses list is unpredictible. In order to always
+  // get the same result, we will look for the closest dominance.
+  auto GetClosestDependency = [this](Instruction *Best, Instruction *Other) {
+    assert(Other && "Must call it with not null instruction");
+    if (Best == nullptr || DT.dominates(Best, Other))
+      return Other;
+    return Best;
+  };
+
   while (!LoadOperandsQueue.empty()) {
     const Value *Ptr = LoadOperandsQueue.pop_back_val();
     assert(Ptr && !isa<GlobalValue>(Ptr) &&
@@ -388,12 +409,24 @@
       // If we hit load/store with the same invariant.group metadata (and the
       // same pointer operand) we can assume that value pointed by pointer
       // operand didn't change.
-      if ((isa<LoadInst>(U) || isa<StoreInst>(U)) && U->getParent() == BB &&
+      if ((isa<LoadInst>(U) || isa<StoreInst>(U)) &&
           U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD)
-        return MemDepResult::getDef(U);
+        ClosestDependency = GetClosestDependency(ClosestDependency, U);
     }
   }
-  return MemDepResult::getUnknown();
+
+  if (!ClosestDependency)
+    return MemDepResult::getUnknown();
+  if (ClosestDependency->getParent() == BB)
+    return MemDepResult::getDef(ClosestDependency);
+  // Def(U) can't be returned here because it is non-local. If local
+  // dependency won't be found then return nonLocal counting that the
+  // user will call getNonLocalPointerDependency, which will return cached
+  // result.
+  NonLocalDefsCache.try_emplace(
+      LI, NonLocalDepResult(ClosestDependency->getParent(),
+                            MemDepResult::getDef(ClosestDependency), nullptr));
+  return MemDepResult::getNonLocal();
 }
 
 MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
@@ -877,7 +910,14 @@
   assert(Loc.Ptr->getType()->isPointerTy() &&
          "Can't get pointer deps of a non-pointer!");
   Result.clear();
-
+  {
+    auto NonLocalDefIt = NonLocalDefsCache.find(QueryInst);
+    if (NonLocalDefIt != NonLocalDefsCache.end()) {
+      Result.push_back(std::move(NonLocalDefIt->second));
+      NonLocalDefsCache.erase(NonLocalDefIt);
+      return;
+    }
+  }
   // This routine does not expect to deal with volatile instructions.
   // Doing so would require piping through the QueryInst all the way through.
   // TODO: volatiles can't be elided, but they can be reordered with other
Index: test/Transforms/GVN/assume-equal.ll
===================================================================
--- test/Transforms/GVN/assume-equal.ll
+++ test/Transforms/GVN/assume-equal.ll
@@ -65,22 +65,20 @@
   %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0
   %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)**
   %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8
-; FIXME: those loads could be also direct, but right now the invariant.group
-; analysis works only on single block
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
   %callx = tail call i32 %call1(%struct.A* %0) #1
   
   %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0
   %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)**
   %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
   %cally = tail call i32 %call4(%struct.A* %0) #1
   
   %b = bitcast i8* %call to %struct.A**
   %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0
   %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)**
   %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
   %unknown = tail call i32 %vfun(%struct.A* %0) #1
   
   br label %if.end
Index: test/Transforms/GVN/invariant.group.ll
===================================================================
--- test/Transforms/GVN/invariant.group.ll
+++ test/Transforms/GVN/invariant.group.ll
@@ -392,6 +392,44 @@
    ret void
 }
 
+; CHECK-LABEL: define void @handling_loops()
+define void @handling_loops() {
+  %a = alloca %struct.A, align 8
+  %1 = bitcast %struct.A* %a to i8*
+  %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0
+  %3 = load i8, i8* @unknownPtr, align 4
+  %4 = icmp sgt i8 %3, 0
+  br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit
+
+.lr.ph.i:                                         ; preds = %0
+  %5 = bitcast %struct.A* %a to void (%struct.A*)***
+  %6 = load i8, i8* @unknownPtr, align 4
+  %7 = icmp sgt i8 %6, 1
+  br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit
+
+._crit_edge.preheader:                            ; preds = %.lr.ph.i
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.preheader, %._crit_edge
+  %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ]
+  %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0
+  %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8
+  ; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
+  call void %9(%struct.A* nonnull %a) #3
+  ; CHECK-NOT: call void %
+  %10 = add nuw nsw i8 %8, 1
+  %11 = load i8, i8* @unknownPtr, align 4
+  %12 = icmp slt i8 %10, %11
+  br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit
+
+_Z2g2R1A.exit.loopexit:                           ; preds = %._crit_edge
+  br label %_Z2g2R1A.exit
+
+_Z2g2R1A.exit:                                    ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0
+  ret void
+}
+
 
 declare void @foo(i8*)
 declare void @foo2(i8*, i8)
Index: test/Transforms/NewGVN/assume-equal.ll
===================================================================
--- test/Transforms/NewGVN/assume-equal.ll
+++ test/Transforms/NewGVN/assume-equal.ll
@@ -66,22 +66,20 @@
   %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0
   %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)**
   %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8
-; FIXME: those loads could be also direct, but right now the invariant.group
-; analysis works only on single block
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
   %callx = tail call i32 %call1(%struct.A* %0) #1
   
   %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0
   %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)**
   %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
   %cally = tail call i32 %call4(%struct.A* %0) #1
   
   %b = bitcast i8* %call to %struct.A**
   %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0
   %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)**
   %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
   %unknown = tail call i32 %vfun(%struct.A* %0) #1
   
   br label %if.end
Index: test/Transforms/NewGVN/invariant.group.ll
===================================================================
--- test/Transforms/NewGVN/invariant.group.ll
+++ test/Transforms/NewGVN/invariant.group.ll
@@ -393,6 +393,45 @@
    ret void
 }
 
+; CHECK-LABEL: define void @handling_loops()
+define void @handling_loops() {
+  %a = alloca %struct.A, align 8
+  %1 = bitcast %struct.A* %a to i8*
+  %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0
+  %3 = load i8, i8* @unknownPtr, align 4
+  %4 = icmp sgt i8 %3, 0
+  br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit
+
+.lr.ph.i:                                         ; preds = %0
+  %5 = bitcast %struct.A* %a to void (%struct.A*)***
+  %6 = load i8, i8* @unknownPtr, align 4
+  %7 = icmp sgt i8 %6, 1
+  br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit
+
+._crit_edge.preheader:                            ; preds = %.lr.ph.i
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.preheader, %._crit_edge
+  %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ]
+  %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0
+  %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8
+; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
+  call void %9(%struct.A* nonnull %a) #3
+
+; CHECK-NOT: call void %
+  %10 = add nuw nsw i8 %8, 1
+  %11 = load i8, i8* @unknownPtr, align 4
+  %12 = icmp slt i8 %10, %11
+  br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit
+
+_Z2g2R1A.exit.loopexit:                           ; preds = %._crit_edge
+  br label %_Z2g2R1A.exit
+
+_Z2g2R1A.exit:                                    ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0
+  ret void
+}
+
 
 declare void @foo(i8*)
 declare void @foo2(i8*, i8)