Index: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -564,11 +564,10 @@
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// Checks existence of store to invariant address inside loop.
-  /// If the loop has any store to invariant address, then it returns true,
-  /// else returns false.
-  bool hasStoreToLoopInvariantAddress() const {
-    return StoreToLoopInvariantAddress;
+  /// If the loop has any store of a variant value to an invariant address, then
+  /// return true, else return false.
+  bool hasVariantStoreToLoopInvariantAddress() const {
+    return HasVariantStoreToLoopInvariantAddress;
   }
 
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
@@ -621,9 +620,8 @@
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
 
-  /// Indicator for storing to uniform addresses.
-  /// If a loop has write to a loop invariant address then it should be true.
-  bool StoreToLoopInvariantAddress;
+  /// Indicator that there is a store of a variant value to a uniform address.
+  bool HasVariantStoreToLoopInvariantAddress;
 
   /// The diagnostics report generated for the analysis. E.g. why we
   /// couldn't analyze the loop.
Index: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1862,10 +1862,21 @@
   // writes and between reads and writes, but not between reads and reads.
   ValueSet Seen;
 
+  // Record uniform store addresses to identify if we have multiple stores
+  // to the same address.
+  ValueSet UniformStores;
+
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
-    // Check for store to loop invariant address.
-    StoreToLoopInvariantAddress |= isUniform(Ptr);
+
+    if (isUniform(Ptr)) {
+      // Consider multiple stores to the same uniform address as a store of a
+      // variant value.
+      bool MultipleStoresToUniformPtr = !UniformStores.insert(Ptr).second;
+      HasVariantStoreToLoopInvariantAddress |=
+          (!isUniform(ST->getValueOperand()) || MultipleStoresToUniformPtr);
+    }
+
     // If we did *not* see this pointer before, insert it to the read-write
     // list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr).second) {
@@ -2265,7 +2276,7 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      StoreToLoopInvariantAddress(false) {
+      HasVariantStoreToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
@@ -2297,8 +2308,8 @@
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Store to invariant address was "
-                   << (StoreToLoopInvariantAddress ? "" : "not ")
+  OS.indent(Depth) << "Variant Store to invariant address was "
+                   << (HasVariantStoreToLoopInvariantAddress ? "" : "not ")
                    << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -817,9 +817,10 @@
   if (!LAI->canVectorizeMemory())
     return false;
 
-  if (LAI->hasStoreToLoopInvariantAddress()) {
+  if (LAI->hasVariantStoreToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
-              << "write to a loop invariant address could not be vectorized");
+              << "write of variant value to a loop invariant address could not "
+                 "be vectorized");
     LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
     return false;
   }
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1174,8 +1174,11 @@
   /// memory access.
   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
 
-  /// The cost calculation for Load instruction \p I with uniform pointer -
-  /// scalar load + broadcast.
+  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
+  /// Load: scalar load + broadcast.
+  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
+  /// element)
+  /// TODO: Test the extra cost of the extract when a loop variant value is stored.
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
   /// Returns whether the instruction is a load or store and will be a emitted
@@ -5297,15 +5300,23 @@
 
 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                          unsigned VF) {
-  LoadInst *LI = cast<LoadInst>(I);
-  Type *ValTy = LI->getType();
+  Type *ValTy = getMemInstValueType(I);
   Type *VectorTy = ToVectorTy(ValTy, VF);
-  unsigned Alignment = LI->getAlignment();
-  unsigned AS = LI->getPointerAddressSpace();
+  unsigned Alignment = getLoadStoreAlignment(I);
+  unsigned AS = getLoadStoreAddressSpace(I);
+  if (isa<LoadInst>(I)) {
+    return TTI.getAddressComputationCost(ValTy) +
+           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+  }
+  StoreInst *SI = cast<StoreInst>(I);
+  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
   return TTI.getAddressComputationCost(ValTy) +
-         TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
-         TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+         (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
+                                              Instruction::ExtractElement,
+                                              VectorTy, VF - 1));
 }
 
 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
@@ -5404,15 +5415,22 @@
       if (!Ptr)
         continue;
 
+      // TODO: We should generate better code and update the cost model for
+      // predicated uniform stores. Today they are treated as any other
+      // predicated store (see added test cases in
+      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;
 
-      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr) &&
-          // Conditional loads should be scalarized and predicated.
+      if (Legal->isUniform(Ptr) &&
+          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
-        // Scalar load + broadcast
+        // TODO: Avoid replicating loads and stores instead of
+        // relying on instcombine to remove them.
+        // Load: Scalar load + broadcast
+        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
Index: llvm/trunk/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
===================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -39,7 +39,7 @@
 ; CHECK-NEXT: Group
 ; CHECK-NEXT: (Low: %b High: ((4 * (1 umax %x)) + %b))
 ; CHECK-NEXT: Member: {%b,+,4}<%for.body>
-; CHECK: Store to invariant address was not found in loop.
+; CHECK: Variant Store to invariant address was not found in loop.
 ; CHECK-NEXT: SCEV assumptions:
 ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags:
 ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags:
Index: llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
===================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
@@ -13,14 +13,14 @@
 ; The LAA with the new PM is a loop pass so we go from inner to outer loops.
 
 ; OLDPM: for.cond1.preheader:
-; OLDPM: Store to invariant address was not found in loop.
+; OLDPM: Variant Store to invariant address was not found in loop.
 ; OLDPM: for.body3:
-; OLDPM: Store to invariant address was found in loop.
+; OLDPM: Variant Store to invariant address was found in loop.
 
 ; NEWPM: for.body3:
-; NEWPM: Store to invariant address was found in loop.
+; NEWPM: Variant Store to invariant address was found in loop.
 ; NEWPM: for.cond1.preheader:
-; NEWPM: Store to invariant address was not found in loop.
+; NEWPM: Variant Store to invariant address was not found in loop.
 
 define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
 entry:
Index: llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
===================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
@@ -10,8 +10,8 @@
 ;    }
 ;  }
 
-; CHECK: Store to invariant address was not found in loop.
-; CHECK-NOT: Store to invariant address was found in loop.
+; CHECK: Variant Store to invariant address was not found in loop.
+; CHECK-NOT: Variant Store to invariant address was found in loop.
 
 define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 {
Index: llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
===================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
@@ -10,7 +10,7 @@
 ;    }
 ;  }
 
-; CHECK: Store to invariant address was found in loop.
+; CHECK: Variant Store to invariant address was found in loop.
 
 define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 {
 entry:
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; First test checks that a loop with a reduction and a uniform store gets
+; vectorized.
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction
+; CHECK-LABEL: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <16 x i32>
+; CHECK: [[ADD]] = add <16 x i32> %vec.phi, %wide.load
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK-NOT: store i32 %ntrunc, i32* %a
+; CHECK: %index.next = add i64 %index, 64
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <16 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  store i32 %ntrunc, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; Conditional store
+; if (b[i] == k) a = ntrunc
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+; CHECK-LABEL: @inv_val_store_to_inv_address_conditional(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT9]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !8, !noalias !11
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP5]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT8]], <16 x i32*> [[BROADCAST_SPLAT10]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK: cond_store:
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -0,0 +1,260 @@
+; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+; The first licm pass is to hoist/sink invariant stores if possible. Today LICM does
+; not hoist/sink the invariant stores. Even if that changes, we should still
+; vectorize this loop in case licm is not run.
+
+; The next licm pass after vectorization is to hoist/sink loop invariant
+; instructions.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; All tests check that it is legal to vectorize the stores to an invariant
+; address.
+
+
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
+; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
+; CHECK: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <4 x i32>
+; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <4 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  store i32 %ntrunc, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: inv_val_store_to_inv_address(
+; CHECK-LABEL: vector.body:
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK: store <4 x i32>
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, i32* %tmp1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+; Both of these tests below are handled as predicated stores.
+
+; Conditional store
+; if (b[i] == k) a = ntrunc
+; TODO: We can do better with the code gen for the first test and we can have
+; just one scalar store if vector.or.reduce(vector_cmp(b[i] == k)) is 1.
+
+; CHECK-LABEL:inv_val_store_to_inv_address_conditional(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: br label %pred.store.continue
+
+; CHECK-LABEL: pred.store.continue:
+; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; if (b[i] == k)
+; a = ntrunc
+; else a = k;
+; TODO: We could vectorize this once we support multiple uniform stores to the
+; same address.
+; CHECK-LABEL:inv_val_store_to_inv_address_conditional_diff_values(
+; CHECK-NOT: load <4 x i32>
+define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+cond_store_k:
+  store i32 %k, i32 * %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Instcombine'd version of above test. Now the store is no longer of invariant
+; value.
+; TODO: We should be able to vectorize this loop once we support vectorizing
+; stores of variant values to invariant addresses.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
+; CHECK-NOT: <4 x
+define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Invariant value stored to an invariant address, predicated on an invariant condition.
+; This is not treated as a predicated store since the block the store belongs to
+; is the latch block (which doesn't need to be predicated).
+; TODO: We should vectorize this loop once we relax the check for
+; variant/invariant values being stored to an invariant address.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
+; CHECK-NOT: <4 x
+define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  %cmp = icmp eq i32 %ntrunc, %k
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; TODO: This loop can be vectorized once we support a variant value being
+; stored into an invariant address.
+; CHECK-LABEL: variant_val_store_to_inv_address
+; CHECK-NOT: <4 x i32>
+define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  %cmp = icmp eq i32 %ntrunc, %k
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %tmp2, i32* %a
+  %tmp3 = add i32 %tmp0, %tmp2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %rdx.lcssa = phi i32 [ %tmp0, %for.body ]
+  ret i32 %rdx.lcssa
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll
@@ -29,7 +29,10 @@
 @a = external global i32, align 4
 @b = external global [1 x i32], align 4
 
-; CHECK: LV: Not vectorizing: Cannot prove legality.
+; We can vectorize this loop because we are storing an invariant value into an
+; invariant address.
+
+; CHECK: LV: We can vectorize this loop!
 ; CHECK-LABEL: @test
 define void @test() {
 entry: