Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -585,7 +585,7 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
-    NumOpsWantToKeepOrder.clear();
+    NumOpsWantToKeepOrder = 0;
     for (auto &Iter : BlocksSchedules) {
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
@@ -599,14 +599,7 @@
   void optimizeGatherSequence();
 
   /// \returns true if it is beneficial to reverse the vector order.
-  bool shouldReorder() const {
-    return std::accumulate(
-        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), 0,
-        [](int Val1,
-           const decltype(NumOpsWantToKeepOrder)::value_type &Val2) {
-          return Val1 + (Val2.second < 0 ? 1 : -1);
-        }) > 0;
-  }
+  bool shouldReorder() const { return NumOpsWantToKeepOrder < 0; }
 
   /// \return The vector element size in bits to use when vectorizing the
   /// expression tree ending at \p V. If V is a store, the size is the width of
@@ -1205,7 +1198,7 @@
   /// Number of operation bundles that contain consecutive operations - number
   /// of operation bundles that contain consecutive operations in reversed
   /// order.
-  DenseMap<unsigned, int> NumOpsWantToKeepOrder;
+  int NumOpsWantToKeepOrder;
 
   // Analysis and block reference.
   Function *F;
@@ -1560,11 +1553,11 @@
       bool Reuse = canReuseExtract(VL, VL0);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
-        ++NumOpsWantToKeepOrder[S.Opcode];
+        ++NumOpsWantToKeepOrder;
       } else {
         SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
         if (canReuseExtract(ReverseVL, VL0))
-          --NumOpsWantToKeepOrder[S.Opcode];
+          --NumOpsWantToKeepOrder;
         BS.cancelScheduling(VL, VL0);
       }
       newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies);
@@ -1614,7 +1607,7 @@
       }
 
       if (Consecutive) {
-        ++NumOpsWantToKeepOrder[S.Opcode];
+        ++NumOpsWantToKeepOrder;
         newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
@@ -1630,7 +1623,7 @@
      }
 
      if (ReverseConsecutive) {
-        --NumOpsWantToKeepOrder[S.Opcode];
+        --NumOpsWantToKeepOrder;
        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
        DEBUG(dbgs() << "SLP: added a vector of reversed loads.\n");
        return;
@@ -1797,22 +1790,54 @@
     }
     case Instruction::Store: {
       // Check if the stores are consecutive or of we need to swizzle them.
-      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
-          DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
-          return;
+      bool IsConsecutive = true;
+      bool IsReverseConsecutive = true;
+      for (unsigned I = 0, E = VL.size() - 1; I < E; ++I) {
+        if (isConsecutiveAccess(VL[I], VL[I + 1], *DL, *SE))
+          IsReverseConsecutive = false;
+        else {
+          IsConsecutive = false;
+          break;
         }
+      }
 
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
-      DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+      if (IsConsecutive) {
+        ++NumOpsWantToKeepOrder;
+        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+        DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+        ValueList Operands;
+        for (Value *V : VL)
+          Operands.push_back(cast<StoreInst>(V)->getOperand(0));
+
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        return;
+      }
+
+      if (IsReverseConsecutive) {
+        for (unsigned I = VL.size(); I > 1; --I) {
+          if (!isConsecutiveAccess(VL[I - 1], VL[I - 2], *DL, *SE)) {
+            IsReverseConsecutive = false;
+            break;
+          }
+        }
+        if (IsReverseConsecutive) {
+          // Reversed consecutive order.
+          --NumOpsWantToKeepOrder;
+          newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+          DEBUG(dbgs() << "SLP: added a vector of reversed stores.\n");
+
+          ValueList Operands;
+          for (Value *V : VL)
+            Operands.push_back(cast<StoreInst>(V)->getOperand(0));
 
-      ValueList Operands;
-      for (Value *j : VL)
-        Operands.push_back(cast<StoreInst>(j)->getOperand(0));
+          buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+          return;
+        }
+      }
 
-      buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+      BS.cancelScheduling(VL, VL0);
+      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+      DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
       return;
     }
     case Instruction::Call: {
@@ -2254,7 +2279,8 @@
     }
     case Instruction::Store: {
       // We know that we can merge the stores. Calculate the cost.
-      unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
+      auto *SI = cast<StoreInst>(VL0);
+      unsigned alignment = SI->getAlignment();
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
                             TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
@@ -2264,6 +2290,10 @@
           TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
       int VecStCost =
           TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
+      if (!isConsecutiveAccess(VL[0], VL[1], *DL, *SE)) {
+        VecStCost += TTI->getShuffleCost(
+            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+      }
       return ReuseShuffleCost + VecStCost - ScalarStCost;
     }
     case Instruction::Call: {
@@ -3245,7 +3275,9 @@
       return V;
     }
     case Instruction::Store: {
-      StoreInst *SI = cast<StoreInst>(VL0);
+      bool IsConsecutive =
+          isConsecutiveAccess(E->Scalars[0], E->Scalars[1], *DL, *SE);
+      StoreInst *SI = cast<StoreInst>(IsConsecutive ? VL0 : E->Scalars.back());
       unsigned Alignment = SI->getAlignment();
       unsigned AS = SI->getPointerAddressSpace();
 
@@ -3256,6 +3288,12 @@
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       Value *VecValue = vectorizeTree(ScalarStoreValues);
+      if (!IsConsecutive) {
+        SmallVector<unsigned, 4> Mask(E->Scalars.size());
+        std::iota(Mask.rbegin(), Mask.rend(), 0);
+        VecValue = Builder.CreateShuffleVector(
+            VecValue, UndefValue::get(VecValue->getType()), Mask);
+      }
       Value *ScalarPtr = SI->getPointerOperand();
       Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
@@ -4570,6 +4608,10 @@
     ArrayRef<Value *> Operands = Chain.slice(i, VF);
 
     R.buildTree(Operands);
+    if (R.shouldReorder()) {
+      SmallVector<Value *, 4> Reversed(Operands.rbegin(), Operands.rend());
+      R.buildTree(Reversed);
+    }
     if (R.isTreeTinyAndNotFullyVectorizable())
       continue;
 
Index: test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
+++ test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
@@ -92,15 +92,14 @@
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* [[TMP6]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
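
As a rough illustration of the kind of source this change targets, consider the hypothetical C kernel below (it is not the stores_vectorize.ll function above; the names and indices are made up for the sketch). The loads walk a[] and b[] forwards while the stores walk dst[] backwards, so once the store chain is taken in address order the load bundles are consecutive only in reversed order. Previously the vectorizer kept the chain in address order and paid a reverse shuffle on each vectorized load; with this patch shouldReorder() lets vectorizeStoreChain rebuild the tree on the reversed operand list, buildTree_rec accepts the now reverse-consecutive store bundle, and a single shufflevector is emitted on the stored value instead, as in the updated CHECK lines.

    #include <stdint.h>

    /* Hypothetical example: the four stores hit dst[3..0] (reverse address
     * order) while the loads read a[0..3] and b[0..3], mirroring the
     * shl-of-loads pattern in the regression test. */
    void reversed_store_chain(int64_t *dst, const int64_t *a,
                              const int64_t *b) {
      dst[3] = a[0] << b[0];
      dst[2] = a[1] << b[1];
      dst[1] = a[2] << b[2];
      dst[0] = a[3] << b[3];
    }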