diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -32,6 +32,7 @@
 class PredicatedScalarEvolution;
 class ScalarEvolution;
 class SCEV;
+class StoreInst;
 class DominatorTree;
 
 /// These are the kinds of recurrences that we support.
@@ -72,13 +73,13 @@
 public:
   RecurrenceDescriptor() = default;
 
-  RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
-                       FastMathFlags FMF, Instruction *ExactFP, Type *RT,
-                       bool Signed, bool Ordered,
+  RecurrenceDescriptor(Value *Start, Instruction *Exit, StoreInst *Store,
+                       RecurKind K, FastMathFlags FMF, Instruction *ExactFP,
+                       Type *RT, bool Signed, bool Ordered,
                        SmallPtrSetImpl<Instruction *> &CI)
-      : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
-        ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
-        IsOrdered(Ordered) {
+      : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit),
+        Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT),
+        IsSigned(Signed), IsOrdered(Ordered) {
     CastInsts.insert(CI.begin(), CI.end());
   }
 
@@ -165,7 +166,7 @@
   /// non-null, the minimal bit width needed to compute the reduction will be
   /// computed.
   static bool AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop,
-                              FastMathFlags FuncFMF,
+                              FastMathFlags FuncFMF, ScalarEvolution *SE,
                               RecurrenceDescriptor &RedDes,
                               DemandedBits *DB = nullptr,
                               AssumptionCache *AC = nullptr,
@@ -175,7 +176,7 @@
   /// is returned in RedDes. If either \p DB is non-null or \p AC and \p DT are
   /// non-null, the minimal bit width needed to compute the reduction will be
   /// computed.
-  static bool isReductionPHI(PHINode *Phi, Loop *TheLoop,
+  static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, ScalarEvolution *SE,
                              RecurrenceDescriptor &RedDes,
                              DemandedBits *DB = nullptr,
                              AssumptionCache *AC = nullptr,
@@ -260,6 +261,9 @@
   SmallVector<Instruction *, 4> getReductionOpChain(PHINode *Phi,
                                                     Loop *L) const;
 
+  // Intermediate store of the reduction: a store of the running reduction
+  // value to a loop-invariant address inside the loop, if any.
+  StoreInst *IntermediateStore = nullptr;
+
 private:
   // The starting value of the recurrence.
   // It does not have to be zero!
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -579,6 +579,11 @@
     return HasDependenceInvolvingLoopInvariantAddress;
   }
 
+  /// Return the list of stores to invariant addresses.
+  const SmallVectorImpl<StoreInst *> &getStoresToInvariantAddresses() const {
+    return StoresToInvariantAddresses;
+  }
+
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
   /// them to a more usable form.  All SCEV expressions during the analysis
   /// should be re-written (and therefore simplified) according to PSE.
@@ -633,6 +638,9 @@
   /// Indicator that there are non vectorizable stores to a uniform address.
   bool HasDependenceInvolvingLoopInvariantAddress;
 
+  /// List of stores to invariant addresses.
+  SmallVector<StoreInst *, 5> StoresToInvariantAddresses;
+
   /// The diagnostics report generated for the analysis.  E.g. why we
   /// couldn't analyze the loop.
   std::unique_ptr<OptimizationRemarkAnalysis> Report;
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1116,6 +1116,9 @@
   /// specified loop.
   bool isLoopInvariant(const SCEV *S, const Loop *L);
 
+  /// Returns true if \p A and \p B store to the same address, i.e. they have
+  /// the same pointer operand or their pointer operands have the same SCEV.
+  bool storeToSameAddress(StoreInst *A, StoreInst *B);
+
   /// Determine if the SCEV can be evaluated at loop's entry. It is true if it
   /// doesn't depend on a SCEVUnknown of an instruction which is dominated by
   /// the header of loop L.
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -307,6 +307,10 @@
   /// Returns the widest induction type.
   Type *getWidestInductionType() { return WidestIndTy; }
 
+  /// Returns true if the given invariant store \p SI stores the result of one
+  /// of the recurrences in this loop.
+  /// \p IsPredicated is set to true if the block containing \p SI needs
+  /// predication.
+  bool isRecurringInvariantStore(StoreInst *SI, bool *IsPredicated);
+
   /// Returns True if V is a Phi node of an induction variable in this loop.
   bool isInductionPhi(const Value *V);
 
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -214,12 +214,10 @@
   return true;
 }
 
-bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
-                                           Loop *TheLoop, FastMathFlags FuncFMF,
-                                           RecurrenceDescriptor &RedDes,
-                                           DemandedBits *DB,
-                                           AssumptionCache *AC,
-                                           DominatorTree *DT) {
+bool RecurrenceDescriptor::AddReductionVar(
+    PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
+    ScalarEvolution *SE, RecurrenceDescriptor &RedDes, DemandedBits *DB,
+    AssumptionCache *AC, DominatorTree *DT) {
   if (Phi->getNumIncomingValues() != 2)
     return false;
 
@@ -236,6 +234,8 @@
   // This includes users of the reduction, variables (which form a cycle
   // which ends in the phi node).
   Instruction *ExitInstruction = nullptr;
+  StoreInst *IntermediateStore = nullptr;
+
   // Indicates that we found a reduction operation in our scan.
   bool FoundReduxOp = false;
 
@@ -296,6 +296,10 @@
   //  - By instructions outside of the loop (safe).
   //      * One value may have several outside users, but all outside
   //        uses must be of the same value.
+  //  - By store instructions with a loop-invariant address (safe with the
+  //    following restrictions; see the sketch after this list):
+  //      * If there are several stores, all must store to the same address.
+  //      * The final reduction value must be stored to that address.
   //  - By an instruction that is not part of the reduction (not safe).
   //    This is either:
   //      * An instruction type other than PHI or the reduction operation.
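+  //
+  // A minimal sketch of the supported shape (illustrative IR, names made up):
+  //   loop:
+  //     %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
+  //     %val = load i32, i32* %src.gep
+  //     %sum.next = add i32 %sum, %val
+  //     store i32 %sum.next, i32* %dst.gep  ; %dst.gep is loop-invariant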
@@ -303,6 +307,37 @@
   while (!Worklist.empty()) {
     Instruction *Cur = Worklist.pop_back_val();
 
+    // Store instructions are allowed only if they store the reduction
+    // value to the same loop-invariant memory location.
+    if (auto *SI = dyn_cast<StoreInst>(Cur)) {
+      const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand());
+      // Check that it stores to the same address as any previous store.
+      if (IntermediateStore) {
+        const SCEV *OtherScev =
+            SE->getSCEV(IntermediateStore->getPointerOperand());
+
+        if (OtherScev != PtrScev) {
+          LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses "
+                            << "inside the loop: " << *SI->getPointerOperand()
+                            << " and "
+                            << *IntermediateStore->getPointerOperand() << '\n');
+          return false;
+        }
+      }
+
+      // Check that the pointer is loop-invariant.
+      if (!SE->isLoopInvariant(PtrScev, TheLoop)) {
+        LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address "
+                          << "inside the loop: " << *SI->getPointerOperand()
+                          << '\n');
+        return false;
+      }
+
+      // IntermediateStore is always the last store in the loop.
+      IntermediateStore = SI;
+      continue;
+    }
+
     // No Users.
     // If the instruction has no users then this is a broken chain and can't be
     // a reduction variable.
@@ -418,8 +453,17 @@
       if (VisitedInsts.insert(UI).second) {
         if (isa<PHINode>(UI))
           PHIs.push_back(UI);
-        else
-          NonPHIs.push_back(UI);
+        else {
+          if (auto *SI = dyn_cast<StoreInst>(UI)) {
+            // The reduction chain value may be stored to memory, but it
+            // cannot be used as a store address.
+            if (SI->getValueOperand() == Cur)
+              NonPHIs.push_back(UI);
+            else
+              return false;
+          } else
+            NonPHIs.push_back(UI);
+        }
       } else if (!isa<PHINode>(UI) &&
                  ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
                    !isa<SelectInst>(UI)) ||
@@ -447,6 +491,21 @@
   if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1)
     return false;
 
+  // If there is an intermediate store, it must store the last reduction value.
+  if (ExitInstruction != nullptr && IntermediateStore) {
+    if (IntermediateStore->getValueOperand() != ExitInstruction) {
+      LLVM_DEBUG(
+          dbgs() << "Last store instruction of the reduction value "
+                 << "does not store the last calculated value of the "
+                 << "reduction: " << *IntermediateStore << '\n');
+      return false;
+    }
+  }
+  // If all uses are inside the loop (intermediate stores), then the
+  // reduction value after the loop will be the one used in the last store.
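+  // For example (a sketch matching the tests added in this patch):
+  //   for (i = 0; i < n; i++) { sum += src[i]; dst[42] = sum; }
+  // Here the final value of 'sum' escapes only through the store.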
+  if (!ExitInstruction && IntermediateStore)
+    ExitInstruction = cast<Instruction>(IntermediateStore->getValueOperand());
+
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;
 
@@ -505,8 +564,8 @@
   // is saved as part of the RecurrenceDescriptor.
 
   // Save the description of this reduction variable.
-  RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
-                          ReduxDesc.getExactFPMathInst(), RecurrenceType,
+  RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind,
+                          FMF, ReduxDesc.getExactFPMathInst(), RecurrenceType,
                           IsSigned, IsOrdered, CastInsts);
   RedDes = RD;
 
@@ -726,6 +785,7 @@
 }
 
 bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
+                                          ScalarEvolution *SE,
                                           RecurrenceDescriptor &RedDes,
                                           DemandedBits *DB, AssumptionCache *AC,
                                           DominatorTree *DT) {
@@ -737,66 +797,79 @@
   FMF.setNoSignedZeros(
       F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool());
 
-  if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC,
-                      DT)) {
+  if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, SE, RedDes, DB,
+                      AC, DT)) {
     LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI."
                       << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+  if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, SE, RedDes, DB, AC,
+                      DT)) {
     LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC,
-                      DT)) {
+  if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, SE, RedDes, DB,
+                      AC, DT)) {
     LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI."
                       << " PHI." << *Phi << "\n");
     return true;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1987,9 +1987,12 @@
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
 
-    if (isUniform(Ptr))
+    if (isUniform(Ptr)) {
+      // Record store instructions to loop-invariant addresses.
+      StoresToInvariantAddresses.push_back(ST);
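+      // Seeing the same invariant address stored to more than once means
+      // there is a dependence involving a loop-invariant address.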
       HasDependenceInvolvingLoopInvariantAddress |=
           !UniformStores.insert(Ptr).second;
+    }
 
     // If we did *not* see this pointer before, insert it to  the read-write
     // list. At this phase it is only a 'write' list.
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -12825,6 +12825,24 @@
   return getLoopDisposition(S, L) == LoopInvariant;
 }
 
+bool ScalarEvolution::storeToSameAddress(StoreInst *A, StoreInst *B) {
+  // The same store instruction trivially stores to the same address.
+  if (A == B)
+    return true;
+
+  // Otherwise compare the pointer operands.
+  Value *APtr = A->getPointerOperand();
+  Value *BPtr = B->getPointerOperand();
+  if (APtr == BPtr)
+    return true;
+
+  // Otherwise compare address SCEVs
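+  // This catches, for example, distinct GEP instructions that compute the
+  // same address (an illustrative sketch):
+  //   %p1 = getelementptr inbounds i32, i32* %base, i64 42
+  //   %p2 = getelementptr inbounds i32, i32* %base, i64 42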
+  if (getSCEV(APtr) == getSCEV(BPtr))
+    return true;
+
+  return false;
+}
+
 bool ScalarEvolution::hasComputableLoopEvolution(const SCEV *S, const Loop *L) {
   return getLoopDisposition(S, L) == LoopComputable;
 }
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -731,7 +731,7 @@
 }
 
 // Check V's users to see if it is involved in a reduction in L.
-static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
+static PHINode *findInnerReductionPhi(Loop *L, Value *V, ScalarEvolution *SE) {
   // Reduction variables cannot be constants.
   if (isa<Constant>(V))
     return nullptr;
@@ -741,7 +741,7 @@
       if (PHI->getNumIncomingValues() == 1)
         continue;
       RecurrenceDescriptor RD;
-      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+      if (RecurrenceDescriptor::isReductionPHI(PHI, L, SE, RD))
         return PHI;
       return nullptr;
     }
@@ -774,7 +774,7 @@
         // Check if we have a PHI node in the outer loop that has a reduction
         // result from the inner loop as an incoming value.
         Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
-        PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
+        PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V, SE);
         if (!InnerRedPhi ||
             !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
           LLVM_DEBUG(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -655,8 +655,8 @@
         }
 
         RecurrenceDescriptor RedDes;
-        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
-                                                 DT)) {
+        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, PSE.getSE(),
+                                                 RedDes, DB, AC, DT)) {
           Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
           AllowedExit.insert(RedDes.getLoopExitInstr());
           Reductions[Phi] = RedDes;
@@ -892,10 +892,49 @@
     return false;
 
   if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
-    reportVectorizationFailure("Stores to a uniform address",
-        "write to a loop invariant address could not be vectorized",
-        "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
-    return false;
+    ScalarEvolution *SE = PSE.getSE();
+    SmallVector<StoreInst *, 4> UnhandledStores;
+    bool IsPredicated;
+    // For each invariant address, check that the last stored value is the
+    // result of one of our reductions and that the store is unconditional.
+    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+      if (isRecurringInvariantStore(SI, &IsPredicated)) {
+        if (IsPredicated)
+          UnhandledStores.push_back(SI);
+        else {
+          // Earlier stores to this address are effectively dead code.
+          llvm::erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
+            return SE->storeToSameAddress(SI, I);
+          });
+        }
+      } else if (!isUniform(SI->getValueOperand()))
+        UnhandledStores.push_back(SI);
+    }
+
+    bool IsOK = UnhandledStores.empty();
+    // TODO: we should also validate against InvariantMemSets.
+    if (!IsOK) {
+      reportVectorizationFailure("We don't allow storing to uniform addresses",
+                                 "write to a loop invariant address could not "
+                                 "be vectorized",
+                                 "CantVectorizeStoreToLoopInvariantAddress",
+                                 ORE, TheLoop);
+      return false;
+    }
+  } else if (!LAI->getStoresToInvariantAddresses().empty()) {
+    bool IsPredicated;
+    // Even when there is no dependence, a predicated store of a recurring
+    // value to an invariant address cannot be vectorized.
+    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+      if (isRecurringInvariantStore(SI, &IsPredicated) && IsPredicated) {
+        reportVectorizationFailure(
+            "We don't allow storing to uniform addresses",
+            "write of conditional recurring variant value to a loop "
+            "invariant address could not be vectorized",
+            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+        return false;
+      }
+    }
   }
 
   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
@@ -922,13 +961,29 @@
 
   // We can now only vectorize if all reductions with Exact FP math also
   // have the isOrdered flag set, which indicates that we can move the
-  // reduction operations in-loop.
+  // reduction operations in-loop, and do not have an intermediate store.
   return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
     const RecurrenceDescriptor &RdxDesc = Reduction.second;
-    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
+    return !RdxDesc.hasExactFPMath() ||
+           (RdxDesc.isOrdered() && !RdxDesc.IntermediateStore);
   }));
 }
 
+bool LoopVectorizationLegality::isRecurringInvariantStore(StoreInst *SI,
+                                                          bool *IsPredicated) {
+  bool FoundMatchingRecurrence = false;
+  for (auto &Reduction : Reductions) {
+    const RecurrenceDescriptor &DS = Reduction.second;
+    StoreInst *DSI = DS.IntermediateStore;
+    if (DSI && (DSI == SI)) {
+      FoundMatchingRecurrence = true;
+      *IsPredicated = blockNeedsPredication(DSI->getParent());
+      break;
+    }
+  }
+  return FoundMatchingRecurrence;
+}
+
 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
   Value *In0 = const_cast<Value *>(V);
   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3049,6 +3049,17 @@
                                                VPTransformState &State) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
 
+  // Don't create a memory instruction for an intermediate store of a
+  // reduction variable, because it would be a store to a uniform address.
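+  // The final value is instead stored once after the loop, when the
+  // reduction is fixed up (see the store of ReducedPartRdx added in this
+  // patch).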
+  if (StoreInst *SI = dyn_cast<StoreInst>(Instr)) {
+    for (auto &Reduction : Legal->getReductionVars()) {
+      const RecurrenceDescriptor &DS = Reduction.second;
+      if (DS.IntermediateStore &&
+          PSE.getSE()->storeToSameAddress(SI, DS.IntermediateStore))
+        return;
+    }
+  }
+
   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
   // the first lane and part.
   if (isa<NoAliasScopeDeclInst>(Instr))
@@ -4440,6 +4451,17 @@
     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
 
+  // If there were stores of the reduction value to a uniform memory address
+  // inside the loop, create the final store here.
+  if (StoreInst *SI = RdxDesc.IntermediateStore) {
+    StoreInst *NewSI =
+        Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
+    propagateMetadata(NewSI, SI);
+
+    // If the reduction value is used in other places, the code below will
+    // create PHIs for those uses as usual.
+  }
+
   // Now, we need to fix the users of the reduction variable
   // inside and outside of the scalar remainder loop.
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -931,6 +931,37 @@
   ret double %res
 }
 
+; Test case with invariant store where fadd is strict.
+define void @reduction_store_to_invariant_address(float* %dst, float* readonly %src) {
+; CHECK-ORDERED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+  %arrayidx = getelementptr inbounds float, float* %dst, i64 42
+  store float 0.000000e+00, float* %arrayidx, align 4
+  br label %for.body
+
+for.body:
+  %0 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds float, float* %src, i64 %indvars.iv
+  %1 = load float, float* %arrayidx1, align 4
+  %add = fadd float %0, %1
+  store float %add, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
 !0 = distinct !{!0, !5, !9, !11}
 !1 = distinct !{!1, !5, !10, !11}
 !2 = distinct !{!2, !6, !9, !11}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -462,6 +462,268 @@
   ret i32 %inc.2
 }
 
+; int sum = 0;
+; for(i=0..N) {
+;   sum += src[i];
+;   dst[42] = sum;
+; }
+; CHECK-LABEL: @reduc_store
+
+; CHECK: vector.body:
+; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %arrayidx
+; CHECK: middle.block:
+; CHECK: store i32 %{{[0-9]+}}, i32* %arrayidx
+define void @reduc_store(i32* %dst, i32* readonly %src) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %dst, i64 42
+  store i32 0, i32* %arrayidx, align 4
+  br label %for.body
+for.body:
+  %0 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  store i32 %add, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; float sum = 0;
+; for(i=0..N) {
+;   sum += src[i];
+;   dst[42] = sum;
+; }
+; CHECK-LABEL: @reduc_store_fadd_fast
+; CHECK: vector.body:
+; CHECK-NOT: store float %{{[0-9]+}}, float* %arrayidx
+; CHECK: middle.block:
+; CHECK: store float %{{[0-9]+}}, float* %arrayidx
+define void @reduc_store_fadd_fast(float* %dst, float* readonly %src) {
+entry:
+  %arrayidx = getelementptr inbounds float, float* %dst, i64 42
+  store float 0.000000e+00, float* %arrayidx, align 4
+  br label %for.body
+
+for.body:
+  %0 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds float, float* %src, i64 %indvars.iv
+  %1 = load float, float* %arrayidx1, align 4
+  %add = fadd fast float %0, %1
+  store float %add, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; int sum = 0;
+; for(i=0..N) {
+;   int diff = y[i] - x[i];
+;   if (diff > 0) {
+;     sum += diff;
+;     *t = sum;
+;   }
+; }
+; CHECK-LABEL: @reduc_cond_store
+; CHECK-NOT: vector.body
+define void @reduc_cond_store(i32* %t, i32* readonly %x, i32* readonly %y, i32 %n) {
+entry:
+  store i32 0, i32* %t, align 4
+  %cmp12 = icmp sgt i32 %n, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %if.end, %for.body.preheader
+  %0 = phi i32 [ 0, %for.body.preheader ], [ %3, %if.end ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end ]
+  %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx2, align 4
+  %sub = sub nsw i32 %1, %2
+  %cmp3 = icmp sgt i32 %sub, 0
+  br i1 %cmp3, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %add = add nsw i32 %sub, %0
+  store i32 %add, i32* %t, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %3 = phi i32 [ %add, %if.then ], [ %0, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %if.end, %entry
+  ret void
+}
+
+;  int sum = 0;
+;  for(int i=0; i < 1000; i+=2) {
+;    sum += src[i];
+;    dst[42] = sum;
+;    sum += src[i+1];
+;    dst[42] = sum;
+;  }
+; CHECK-LABEL: @reduc_store_inside_unrolled
+; CHECK: vector.body:
+; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %arrayidx
+; CHECK: middle.block:
+; CHECK: store i32 %{{[0-9]+}}, i32* %arrayidx
+; CHECK: ret void
+define void @reduc_store_inside_unrolled(i32* %dst, i32* readonly %src) {
+entry:
+  %arrayidx1 = getelementptr inbounds i32, i32* %dst, i64 42
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.018 = phi i32 [ 0, %entry ], [ %add5, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.018
+  store i32 %add, i32* %arrayidx1, align 4
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %src, i64 %1
+  %2 = load i32, i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %2, %add
+  store i32 %add5, i32* %arrayidx1, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp slt i64 %indvars.iv.next, 1000
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+;  int sum = 0;
+;  for(int i=0; i < 1000; i+=2) {
+;    sum += src[i];
+;    dst[42] = sum;
+;    sum += src[i+1];
+;    other_dst[42] = sum;
+;  }
+; CHECK-LABEL: @reduc_double_invariant_store
+; CHECK-NOT: vector.body:
+define void @reduc_double_invariant_store(i32* %dst, i32* %other_dst, i32* readonly %src) {
+entry:
+  %arrayidx1 = getelementptr inbounds i32, i32* %dst, i64 42
+  %arrayidx2 = getelementptr inbounds i32, i32* %other_dst, i64 42
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.018 = phi i32 [ 0, %entry ], [ %add5, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.018
+  store i32 %add, i32* %arrayidx1, align 4
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %src, i64 %1
+  %2 = load i32, i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %2, %add
+  store i32 %add5, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp slt i64 %indvars.iv.next, 1000
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+;  int sum = 0;
+;  for(int i=0; i < 1000; i+=2) {
+;    sum += src[i];
+;    dst[42] = sum;
+;    sum += src[i+1];
+;    if (src[i+1] > 0)
+;      dst[42] = sum;
+;  }
+; CHECK-LABEL: @reduc_store_final_store_predicated
+; CHECK-NOT: vector.body:
+define void @reduc_store_final_store_predicated(i32* %dst, i32* readonly %src) {
+entry:
+  %arrayidx1 = getelementptr inbounds i32, i32* %dst, i64 42
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %latch
+  ret void
+
+for.body:                                         ; preds = %latch, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %latch ]
+  %sum.018 = phi i32 [ 0, %entry ], [ %add5, %latch ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.018
+  store i32 %add, i32* %arrayidx1, align 4
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %src, i64 %1
+  %2 = load i32, i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %2, %add
+  %cmp1 = icmp sgt i32 %2, 0
+  br i1 %cmp1, label %predicated, label %latch
+
+predicated:                                       ; preds = %for.body
+  store i32 %add5, i32* %arrayidx1, align 4
+  br label %latch
+
+latch:                                            ; preds = %predicated, %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp slt i64 %indvars.iv.next, 1000
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; int sum = 0;
+; for(int i=0; i < 1000; i++) {
+;   sum += src[i];
+;   dst[42] = sum;
+; }
+; dst[43] = sum;
+; CHECK-LABEL: @reduc_store_inoutside
+; CHECK: vector.body:
+; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %arrayidx
+; CHECK: middle.block:
+; CHECK: store i32 %[[RDX:[0-9]+]], i32* %arrayidx
+; CHECK: for.cond.cleanup:
+; CHECK: %[[PHI:[a-zA-Z.0-9]+]] = phi i32 {{.*}} %[[RDX]]
+; CHECK: %[[ADDR:[a-zA-Z.0-9]+]] = getelementptr inbounds i32, i32* %dst, i64 43
+; CHECK: store i32 %[[PHI]], i32* %[[ADDR]]
+; CHECK: ret void
+define void @reduc_store_inoutside(i32* %dst, i32* readonly %src) {
+entry:
+  %arrayidx1 = getelementptr inbounds i32, i32* %dst, i64 42
+  br label %for.body
+
+for.cond.cleanup:
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 43
+  store i32 %add.lcssa, i32* %arrayidx2, align 4
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.010
+  store i32 %add, i32* %arrayidx1, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
 ;CHECK-LABEL: @reduction_sum_multiuse(
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>