Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -571,6 +571,12 @@
     return StoreToLoopInvariantAddress;
   }
 
+  /// Checks whether the loop contains a store to a loop-invariant address
+  /// that cannot be vectorized.
+  bool hasNonVectorizableStoreToLoopInvariantAddress() const {
+    return NonVectorizableStoreToLoopInvariantAddress;
+  }
+
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
   /// them to a more usable form. All SCEV expressions during the analysis
   /// should be re-written (and therefore simplified) according to PSE.
@@ -625,6 +631,11 @@
   /// If a loop has a write to a loop-invariant address then it should be true.
   bool StoreToLoopInvariantAddress;
 
+  /// Indicates that the loop contains a store to a uniform address that is
+  /// non-vectorizable. Such stores are a subset of those identified by
+  /// StoreToLoopInvariantAddress.
+  bool NonVectorizableStoreToLoopInvariantAddress;
+
   /// The diagnostics report generated for the analysis. E.g. why we
   /// couldn't analyze the loop.
   std::unique_ptr<OptimizationRemarkAnalysis> Report;
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1862,10 +1862,31 @@
   // writes and between reads and writes, but not between reads and reads.
   ValueSet Seen;
 
+  // The stored value is treated as loop-invariant if it is invariant itself,
+  // or if it is an instruction all of whose operands are loop-invariant.
+  auto isLoopInvariant = [this](StoreInst *ST) {
+    auto StoreVal = ST->getValueOperand();
+    if (TheLoop->isLoopInvariant(StoreVal))
+      return true;
+    if (!isa<Instruction>(StoreVal))
+      return false;
+    return TheLoop->hasLoopInvariantOperands(cast<Instruction>(StoreVal));
+  };
+
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
+    bool isUniformPtr = isUniform(Ptr);
+
     // Check for store to loop invariant address.
-    StoreToLoopInvariantAddress |= isUniform(Ptr);
+    StoreToLoopInvariantAddress |= isUniformPtr;
+
+    // A loop-invariant value stored to a loop-invariant address is
+    // vectorizable, so only a store of a loop-varying value marks the loop
+    // as having a non-vectorizable invariant store.
+    NonVectorizableStoreToLoopInvariantAddress |=
+        (isUniformPtr && !isLoopInvariant(ST));
+
     // If we did *not* see this pointer before, insert it to the read-write
     // list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr).second) {
@@ -2265,7 +2281,8 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      StoreToLoopInvariantAddress(false) {
+      StoreToLoopInvariantAddress(false),
+      NonVectorizableStoreToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
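For clarity, the legality rule above can be restated as a standalone sketch.
This is illustrative only and not part of the patch; the helper name
isVectorizableInvariantStore is hypothetical, while Loop::isLoopInvariant and
Loop::hasLoopInvariantOperands are the real LoopInfo APIs used in the lambda:

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // A store to a uniform (loop-invariant) address only blocks
    // vectorization when the stored value varies between iterations.
    static bool isVectorizableInvariantStore(const Loop *L, StoreInst *ST) {
      Value *StoreVal = ST->getValueOperand();
      if (L->isLoopInvariant(StoreVal))
        return true; // e.g. a value defined outside the loop
      // An instruction whose operands are all loop-invariant is treated the
      // same way; any other stored value makes the store non-vectorizable.
      auto *I = dyn_cast<Instruction>(StoreVal);
      return I && L->hasLoopInvariantOperands(I);
    }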
Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -755,7 +755,7 @@
   if (!LAI->canVectorizeMemory())
     return false;
 
-  if (LAI->hasStoreToLoopInvariantAddress()) {
+  if (LAI->hasNonVectorizableStoreToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
               << "write to a loop invariant address could not be vectorized");
     LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1482,8 +1482,10 @@
   /// memory access.
   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
 
-  /// The cost calculation for Load instruction \p I with uniform pointer -
-  /// scalar load + broadcast.
+  /// The cost calculation for Load/Store instruction \p I with uniform
+  /// pointer - Load: scalar load + broadcast.
+  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of
+  /// last element).
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
   /// Returns whether the instruction is a load or store and will be emitted
@@ -5749,6 +5751,7 @@
 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                          unsigned VF) {
+  if (isa<LoadInst>(I)) {
   LoadInst *LI = cast<LoadInst>(I);
   Type *ValTy = LI->getType();
   Type *VectorTy = ToVectorTy(ValTy, VF);
@@ -5758,6 +5761,22 @@
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
          TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+  }
+  StoreInst *SI = cast<StoreInst>(I);
+  // For a store, the stored value's type must be queried through the value
+  // operand; SI->getType() would be void.
+  Type *ValTy = SI->getValueOperand()->getType();
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned Alignment = SI->getAlignment();
+  unsigned AS = SI->getPointerAddressSpace();
+
+  bool isLoopInvariantValueStored =
+      TheLoop->isLoopInvariant(SI->getValueOperand());
+  return TTI.getAddressComputationCost(ValTy) +
+         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+         (isLoopInvariantValueStored ? 0 : TTI.getVectorInstrCost(
+                                               Instruction::ExtractElement,
+                                               VectorTy, VF - 1));
 }
 
@@ -5855,8 +5872,10 @@
       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
         NumPredStores++;
 
-      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
-        // Scalar load + broadcast
+      if ((isa<LoadInst>(&I) || isa<StoreInst>(&I)) && Legal->isUniform(Ptr)) {
+        // Load: Scalar load + broadcast
+        // Store: Scalar store + (isLoopInvariantValueStored ? 0 : extract)
         unsigned Cost = getUniformMemOpCost(&I, VF);
         setWideningDecision(&I, VF, CM_Scalarize, Cost);
         continue;
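Why the extract of the last element: all VF lanes of a uniform store write the
same address, so after each vector iteration only the final lane's value is
observable. The vectorized loop can therefore emit one scalar store of lane
VF - 1; when the stored value is loop-invariant, all lanes are identical and
no extract is needed. A minimal sketch of the resulting cost formula follows
(the function and parameter names are illustrative; the real costs come from
the TTI calls in the hunk above):

    // Cost of a store to a uniform address at a given VF (sketch).
    unsigned uniformStoreCost(unsigned AddrCost, unsigned StoreCost,
                              unsigned ExtractCost, bool StoredValInvariant) {
      // Invariant value: address computation + scalar store only.
      // Varying value: additionally extract lane VF - 1 from the vector of
      // values the scalar loop would have stored.
      return AddrCost + StoreCost + (StoredValInvariant ? 0 : ExtractCost);
    }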
Index: test/Transforms/LoopVectorize/invariant-store-vectorization.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -0,0 +1,277 @@
+; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+; The first licm pass is there to hoist/sink invariant stores if possible.
+; Today LICM does not hoist/sink invariant stores; even if that changes, we
+; should still vectorize these loops in case licm is not run.
+
+; The licm pass run after vectorization hoists/sinks loop-invariant
+; instructions.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; All tests below check whether it is legal to vectorize stores to an
+; invariant address.
+
+
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
+; The memory check is: found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
+; CHECK: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <4 x i32>
+; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <4 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  store i32 %ntrunc, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: inv_val_store_to_inv_address(
+; CHECK-LABEL: vector.body:
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK: store <4 x i32>
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, i32* %tmp1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+; Both tests below are handled as predicated stores, and the cost model
+; identifies them as such.
+
+
+; Conditional store:
+; if (b[i] == k) a = ntrunc
+; TODO: Codegen for the first test can be improved: a single scalar store
+; suffices when vector.or.reduce(vector_cmp(b[i] == k)) is 1.
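+; A sketch of that improved codegen (illustrative only; this patch does not
+; emit it): or-reduce the comparison mask once per vector iteration and guard
+; a single scalar store on the result:
+;   %any = or-reduce(<4 x i1> %cmp)   ; i1: did any lane compare equal?
+;   br i1 %any, label %do.store, label %next
+; do.store:
+;   store i32 %ntrunc, i32* %a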
+
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: br label %pred.store.continue
+
+; CHECK-LABEL: pred.store.continue:
+; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
+
+; if (b[i] == k)
+;   a = ntrunc
+; else
+;   a = k
+; For this case, we still vectorize, by generating predicated stores for both
+; the if and the else cases.
+; TODO: Codegen can be improved to
+; select(extract(vec_cmp(b[i], k), VF - 1) == 1, a = ntrunc, a = k).
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK: [[CMPNOT:%[a-zA-Z0-9.]+]] = xor <4 x i1> [[CMP]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK: [[EENOT1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMPNOT]], i32 0
+; CHECK: br i1 [[EENOT1]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK: store i32 %k, i32* %a
+; CHECK: br label %pred.store.continue
+
+; First come all the predicated stores for a = k; then the original condition
+; is checked and predicated stores are emitted for a = ntrunc.
+
+; CHECK-LABEL: pred.store.continue14:
+; CHECK: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+cond_store_k:
+  store i32 %k, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
+
+; Instcombine'd version of the test above. The store is no longer predicated.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
+; CHECK-LABEL: vector.memcheck:
+; CHECK-LABEL: vector.ph:
+; CHECK: [[BSPLATIN1:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %k, i32 0
+; CHECK: [[BSPLATK:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[BSPLATIN1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: [[BSPLATIN2:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %ntrunc, i32 0
+; CHECK: [[BSPLAT2:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[BSPLATIN2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+
+; CHECK-LABEL: vector.body:
+; CHECK: [[GEPB:%[a-zA-Z0-9.]+]] = getelementptr inbounds i32, i32* %b, i64 %index
+; CHECK-NEXT: [[BCB:%[a-zA-Z0-9.]+]] = bitcast i32* [[GEPB]] to <4 x i32>*
+; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* [[BCB]]
+; CHECK-NEXT: [[INVCOND:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, [[BSPLATK]]
+; CHECK: %predphi = select <4 x i1> [[INVCOND]], <4 x i32> [[BSPLAT2]], <4 x i32> [[BSPLATK]]
+; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> %predphi, i32 3
+; CHECK-NEXT: store i32 [[EE]], i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:                                            ; preds = %cond_store_k, %cond_store
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
+
+; An invariant value, selected by an invariant condition, is stored to an
+; invariant address. This is not treated as a predicated store, since the
+; block the store belongs to is the latch block (which does not need to be
+; predicated).
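+; Conceptually (a sketch, not the verbatim output checked below): because
+; %cmp is loop-invariant, the stored value is the same on every iteration and
+; can be computed once outside the loop:
+;   %storeval = select i1 %cmp, i32 %ntrunc, i32 %k   ; hoisted to vector.ph
+; leaving an unconditional scalar store of %storeval in the vector body.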
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
+; CHECK-LABEL: vector.memcheck:
+; CHECK-LABEL: vector.ph:
+; CHECK: [[BSPLATIN1:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %ntrunc, i32 0
+; CHECK: [[BSPLATN:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[BSPLATIN1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: [[INSK:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %k, i32 3
+; CHECK: %predphi = select <4 x i1> {{.*}}, <4 x i32> [[INSK]], <4 x i32> [[BSPLATN]]
+; CHECK: [[STOREVAL:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> %predphi, i32 3
+; CHECK-NEXT: br label %vector.body
+
+; CHECK-LABEL: vector.body:
+; CHECK: [[GEPB:%[a-zA-Z0-9.]+]] = getelementptr inbounds i32, i32* %b, i64 %index
+; CHECK-NEXT: [[BCB:%[a-zA-Z0-9.]+]] = bitcast i32* [[GEPB]] to <4 x i32>*
+; CHECK: store i32 [[STOREVAL]], i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  %cmp = icmp eq i32 %ntrunc, %k
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:                                            ; preds = %cond_store_k, %cond_store
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
Index: test/Transforms/LoopVectorize/pr31190.ll
===================================================================
--- test/Transforms/LoopVectorize/pr31190.ll
+++ test/Transforms/LoopVectorize/pr31190.ll
@@ -29,7 +29,8 @@
 @a = external global i32, align 4
 @b = external global [1 x i32], align 4
 
-; CHECK: LV: Not vectorizing: Cannot prove legality.
+; We can vectorize this loop because we are storing an invariant value into an
+; invariant address.
 ; CHECK-LABEL: @test
 define void @test() {
 entry: