Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -564,10 +564,10 @@
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// If the loop has multiple stores to an invariant address, then
+  /// If the loop has non-vectorizable stores to an invariant address, then
   /// return true, else return false.
-  bool hasMultipleStoresToLoopInvariantAddress() const {
-    return HasMultipleStoresToLoopInvariantAddress;
+  bool hasNonVectorizableStoresToLoopInvariantAddress() const {
+    return HasNonVectorizableStoresToLoopInvariantAddress;
   }
 
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
@@ -620,8 +620,8 @@
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
 
-  /// Indicator that there are multiple stores to a uniform address.
-  bool HasMultipleStoresToLoopInvariantAddress;
+  /// Indicator that there are non-vectorizable stores to a uniform address.
+  bool HasNonVectorizableStoresToLoopInvariantAddress;
 
   /// The diagnostics report generated for the analysis. E.g. why we
   /// couldn't analyze the loop.
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1865,13 +1865,19 @@
   // Record uniform store addresses to identify if we have multiple stores
   // to the same address.
   ValueSet UniformStores;
+  // Record each uniform store along with its uniform address. Since only one
+  // store to a given uniform address is vectorizable, it does not matter
+  // which store the map keeps if we see several while populating it.
+  DenseMap<Value *, StoreInst *> UniformStoreMap;
 
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
-    if (isUniform(Ptr))
-      HasMultipleStoresToLoopInvariantAddress |=
+    if (isUniform(Ptr)) {
+      HasNonVectorizableStoresToLoopInvariantAddress |=
           !UniformStores.insert(Ptr).second;
+      UniformStoreMap[Ptr] = ST;
+    }
 
     // If we did *not* see this pointer before, insert it to the read-write
     // list. At this phase it is only a 'write' list.
@@ -1914,6 +1920,16 @@
       IsReadOnlyPtr = true;
     }
 
+    // See if there is an unsafe dependency between a load from a uniform
+    // address and a store to the same uniform address.
+    if (!HasNonVectorizableStoresToLoopInvariantAddress &&
+        UniformStores.find(Ptr) != UniformStores.end() &&
+        !DT->dominates(UniformStoreMap[Ptr], LD)) {
+      LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
+                           "load and uniform store to the same address!\n");
+      HasNonVectorizableStoresToLoopInvariantAddress = true;
+    }
+
     MemoryLocation Loc = MemoryLocation::get(LD);
 
     // The TBAA metadata could have a control dependency on the predication
     // condition, so we cannot rely on it when determining whether or not we
@@ -2272,7 +2288,7 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      HasMultipleStoresToLoopInvariantAddress(false) {
+      HasNonVectorizableStoresToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
@@ -2304,8 +2320,8 @@
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Multiple stores to invariant address were "
-                   << (HasMultipleStoresToLoopInvariantAddress ? "" : "not ")
+  OS.indent(Depth) << "Non-vectorizable stores to invariant address were "
+                   << (HasNonVectorizableStoresToLoopInvariantAddress ? "" : "not ")
                    << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -814,18 +814,18 @@
                                    "loop not vectorized: ", *LAR);
     });
   }
 
-  if (!LAI->canVectorizeMemory())
+  if (!LAI->canVectorizeMemory()) {
     return false;
+  }
 
-  if (LAI->hasMultipleStoresToLoopInvariantAddress()) {
+  if (LAI->hasNonVectorizableStoresToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
-              << "multiple writes to a loop invariant address could not "
+              << "write to a loop invariant address could not "
                  "be vectorized");
     LLVM_DEBUG(
-        dbgs() << "LV: We don't allow multiple stores to a uniform address\n");
+        dbgs() << "LV: Non-vectorizable stores to a uniform address\n");
     return false;
   }
-
   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
   PSE.addPredicate(LAI->getPSE().getUnionPredicate());
Index: test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
===================================================================
--- test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -39,7 +39,7 @@
 ; CHECK-NEXT:     Group
 ; CHECK-NEXT:       (Low: %b High: ((4 * (1 umax %x)) + %b))
 ; CHECK-NEXT:         Member: {%b,+,4}<%for.body>
-; CHECK: Multiple stores to invariant address were not found in loop.
+; CHECK: Non-vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:  SCEV assumptions:
 ; CHECK-NEXT:  {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:  {0,+,1}<%for.body> Added Flags: <nusw>
Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
===================================================================
--- test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
+++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
@@ -14,14 +14,14 @@
 ; The LAA with the new PM is a loop pass so we go from inner to outer loops.
 
 ; OLDPM: for.cond1.preheader:
-; OLDPM:   Multiple stores to invariant address were not found in loop.
+; OLDPM:   Non-vectorizable stores to invariant address were not found in loop.
 ; OLDPM: for.body3:
-; OLDPM:   Multiple stores to invariant address were found in loop.
+; OLDPM:   Non-vectorizable stores to invariant address were found in loop.
 
 ; NEWPM: for.body3:
-; NEWPM:   Multiple stores to invariant address were found in loop.
+; NEWPM:   Non-vectorizable stores to invariant address were found in loop.
 ; NEWPM: for.cond1.preheader:
-; NEWPM:   Multiple stores to invariant address were not found in loop.
+; NEWPM:   Non-vectorizable stores to invariant address were not found in loop.
 
 define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
 entry:
Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
===================================================================
--- test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
+++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
@@ -10,8 +10,8 @@
 ;   }
 ; }
 
-; CHECK: Multiple stores to invariant address were not found in loop.
-; CHECK-NOT: Multiple stores to invariant address were found in loop.
+; CHECK: Non-vectorizable stores to invariant address were not found in loop.
+; CHECK-NOT: Non-vectorizable stores to invariant address were found in loop.
 
 define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 {
Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
===================================================================
--- test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
+++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
@@ -10,7 +10,7 @@
 ;   }
 ; }
 
-; CHECK: Multiple stores to invariant address were not found in loop.
+; CHECK: Non-vectorizable stores to invariant address were not found in loop.
 
 define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 {
 entry:
Index: test/Transforms/LoopVectorize/invariant-store-vectorization.ll
===================================================================
--- test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -549,3 +549,117 @@
 for.end10:                                        ; preds = %for.inc8, %entry
   ret i32 undef
 }
+
+; Cannot vectorize a loop with an unsafe dependency between a uniform load
+; and a store to the same uniform address.
+define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
+; CHECK-LABEL: unsafe_dep_uniform_load_store
+; CHECK-NOT: <4 x i32>
+bb:
+  %tmp = alloca i32
+  store i32 %arg4, i32* %tmp
+  %tmp6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5
+  br label %bb7
+
+bb7:
+  %tmp8 = phi i64 [ 0, %bb ], [ %tmp24, %bb7 ]
+  %tmp9 = phi i32 [ %arg1, %bb ], [ %tmp23, %bb7 ]
+  %tmp10 = load i32, i32* %tmp
+  %tmp11 = mul nsw i32 %tmp9, %tmp10
+  %tmp12 = srem i32 %tmp11, 65536
+  %tmp13 = add nsw i32 %tmp12, %tmp9
+  %tmp14 = trunc i32 %tmp13 to i16
+  %tmp15 = trunc i64 %tmp8 to i32
+  %tmp16 = add i32 %arg, %tmp15
+  %tmp17 = zext i32 %tmp16 to i64
+  %tmp18 = getelementptr inbounds i16, i16* %tmp6, i64 %tmp17
+  store i16 %tmp14, i16* %tmp18, align 2
+  %tmp19 = add i32 %tmp13, %tmp9
+  %tmp20 = trunc i32 %tmp19 to i16
+  %tmp21 = and i16 %tmp20, 255
+  %tmp22 = getelementptr inbounds i16, i16* %arg3, i64 %tmp17
+  store i16 %tmp21, i16* %tmp22, align 2
+  %tmp23 = add nsw i32 %tmp9, 1
+  %tmp24 = add nuw nsw i64 %tmp8, 1
+  %tmp25 = icmp eq i64 %tmp24, %arg2
+  store i32 %tmp12, i32* %tmp
+  br i1 %tmp25, label %bb26, label %bb7
+
+bb26:
+  ret void
+}
+
+; A uniform load and store to the same address, but the dependency is safe and the loop can be vectorized.
+define void @safe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
+; CHECK-LABEL: safe_dep_uniform_load_store
+; CHECK: vector.scevcheck:
+; CHECK: [[TMP5:%.*]] = or i1
+; CHECK: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK: vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 %arg2, -4
+; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[CAST_CRD]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[ARG1]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND9:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND11:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <4 x i32> [[VEC_IND9]], [[VEC_IND11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = srem <4 x i32> [[TMP11]], <i32 65536, i32 65536, i32 65536, i32 65536>
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[VEC_IND9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], %arg
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, i16* %tmp6, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i16* [[TMP18]] to <4 x i16>*
+; CHECK-NEXT:    store <4 x i16> [[TMP14]], <4 x i16>* [[TMP19]], align 2, !alias.scope !43, !noalias !46
+; CHECK-NEXT:    [[TMP20:%.*]] = add <4 x i32> [[TMP13]], [[VEC_IND9]]
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = and <4 x i16> [[TMP21]], <i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, i16* %arg3, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i16* [[TMP23]] to <4 x i16>*
+; CHECK-NEXT:    store <4 x i16> [[TMP22]], <4 x i16>* [[TMP24]], align 2, !alias.scope !46
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT10]] = add <4 x i32> [[VEC_IND9]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT12]] = add <4 x i32> [[VEC_IND11]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !48
+bb:
+  %tmp = alloca i32
+  store i32 %arg4, i32* %tmp
+  %tmp6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5
+  br label %bb7
+
+bb7:
+  %tmp8 = phi i64 [ 0, %bb ], [ %tmp24, %bb7 ]
+  %tmp9 = phi i32 [ %arg1, %bb ], [ %tmp23, %bb7 ]
+  %sttrunc = trunc i64 %tmp8 to i32
+  store i32 %sttrunc, i32* %tmp
+  %tmp10 = load i32, i32* %tmp
+  %tmp11 = mul nsw i32 %tmp9, %tmp10
+  %tmp12 = srem i32 %tmp11, 65536
+  %tmp13 = add nsw i32 %tmp12, %tmp9
+  %tmp14 = trunc i32 %tmp13 to i16
+  %tmp15 = trunc i64 %tmp8 to i32
+  %tmp16 = add i32 %arg, %tmp15
+  %tmp17 = zext i32 %tmp16 to i64
+  %tmp18 = getelementptr inbounds i16, i16* %tmp6, i64 %tmp17
+  store i16 %tmp14, i16* %tmp18, align 2
+  %tmp19 = add i32 %tmp13, %tmp9
+  %tmp20 = trunc i32 %tmp19 to i16
+  %tmp21 = and i16 %tmp20, 255
+  %tmp22 = getelementptr inbounds i16, i16* %arg3, i64 %tmp17
+  store i16 %tmp21, i16* %tmp22, align 2
+  %tmp23 = add nsw i32 %tmp9, 1
+  %tmp24 = add nuw nsw i64 %tmp8, 1
+  %tmp25 = icmp eq i64 %tmp24, %arg2
+  br i1 %tmp25, label %bb26, label %bb7
+
+bb26:
+  ret void
+}
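
A hand-written C approximation of the two loop shapes the new tests encode, for readers who prefer C to IR. This is a sketch only: the function and variable names are invented here, the arithmetic is simplified, and the IR in the tests above is authoritative.

  #include <stddef.h>

  /* Unsafe (loop-carried): the load of *p at the top of iteration i reads
     the value stored at the bottom of iteration i-1, so the store to the
     invariant address does not dominate the load. */
  void unsafe_dep(short *dst, size_t n, int k, int *p) {
    int acc = k;
    for (size_t i = 0; i < n; i++) {
      int t = acc * *p % 65536; /* uniform load feeds this iteration */
      dst[i] = (short)(t + acc);
      acc += 1;
      *p = t;                   /* uniform store at the bottom */
    }
  }

  /* Safe (not loop-carried): each iteration stores to *p before loading it
     back, so the store dominates the load and no value crosses iterations. */
  void safe_dep(short *dst, size_t n, int k, int *p) {
    int acc = k;
    for (size_t i = 0; i < n; i++) {
      *p = (int)i;              /* uniform store first ... */
      int t = acc * *p % 65536; /* ... then the uniform load */
      dst[i] = (short)(t + acc);
      acc += 1;
    }
  }

This is also why the patch keys the new check on dominance: if the single uniform store dominates the uniform load, the loaded value is produced within the same iteration and the pair can be vectorized; otherwise HasNonVectorizableStoresToLoopInvariantAddress is set and the vectorizer rejects the loop.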