Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -382,7 +382,7 @@
     /// vector elements, try to merge them into one larger store.
     /// \return True if a merged store was created.
     bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
-                                         EVT MemVT, unsigned NumElem,
+                                         EVT MemVT, unsigned StoresToMerge,
                                          bool IsConstantSrc, bool UseVector);
     
     /// Merge consecutive store operations into a wide store.
@@ -9730,16 +9730,16 @@
 
 bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
                   SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
-                  unsigned NumElem, bool IsConstantSrc, bool UseVector) {
+                  unsigned StoresToMerge, bool IsConstantSrc, bool UseVector) {
   // Make sure we have something to merge.
-  if (NumElem < 2)
+  if (StoresToMerge < 2)
     return false;
   
   int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
   LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
   unsigned EarliestNodeUsed = 0;
   
-  for (unsigned i=0; i < NumElem; ++i) {
+  for (unsigned i=0; i < StoresToMerge; ++i) {
     // Find a chain for the new wide-store operand. Notice that some
     // of the store nodes that we found may not be selected for inclusion
     // in the wide store. The chain we use needs to be the chain of the
@@ -9754,9 +9754,16 @@
   
   SDValue StoredVal;
   if (UseVector) {
-    // Find a legal type for the vector store.
-    EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+    bool IsVec = MemVT.isVector();
+    unsigned Elts = StoresToMerge;
+    if (IsVec) {
+      // When merging vector stores, get the total number of elements.
+      Elts *= MemVT.getVectorNumElements();
+    }
+    // Get the type for the merged vector store.
+    EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
     assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
+
     if (IsConstantSrc) {
       // A vector store with a constant source implies that the constant is
       // zero; we only handle merging stores of constant zeros because the zero
@@ -9766,31 +9773,31 @@
       StoredVal = DAG.getConstant(0, Ty);
     } else {
       SmallVector<SDValue, 8> Ops;
-      for (unsigned i = 0; i < NumElem ; ++i) {
+      for (unsigned i = 0; i < StoresToMerge ; ++i) {
         StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
         SDValue Val = St->getValue();
-        // All of the operands of a BUILD_VECTOR must have the same type.
+        // All operands of BUILD_VECTOR / CONCAT_VECTOR must have the same type.
         if (Val.getValueType() != MemVT)
           return false;
         Ops.push_back(Val);
       }
-      
       // Build the extracted vector elements back into a vector.
-      StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops);
+      StoredVal = DAG.getNode(IsVec ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR,
+                              DL, Ty, Ops);
     }
   } else {
     // We should always use a vector store when merging extracted vector
     // elements, so this path implies a store of constants.
     assert(IsConstantSrc && "Merged vector elements should use vector store");
 
-    unsigned StoreBW = NumElem * ElementSizeBytes * 8;
+    unsigned StoreBW = StoresToMerge * ElementSizeBytes * 8;
     APInt StoreInt(StoreBW, 0);
     
     // Construct a single integer constant which is made of the smaller
     // constant inputs.
     bool IsLE = TLI.isLittleEndian();
-    for (unsigned i = 0; i < NumElem ; ++i) {
-      unsigned Idx = IsLE ? (NumElem - 1 - i) : i;
+    for (unsigned i = 0; i < StoresToMerge ; ++i) {
+      unsigned Idx = IsLE ? (StoresToMerge - 1 - i) : i;
       StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
       SDValue Val = St->getValue();
       StoreInt <<= ElementSizeBytes*8;
@@ -9817,7 +9824,7 @@
   // Replace the first store with the new store
   CombineTo(EarliestOp, NewStore);
   // Erase all other stores.
-  for (unsigned i = 0; i < NumElem ; ++i) {
+  for (unsigned i = 0; i < StoresToMerge ; ++i) {
     if (StoreNodes[i].MemNode == EarliestOp)
       continue;
     StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
@@ -9840,25 +9847,35 @@
 }
 
 bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
-  EVT MemVT = St->getMemoryVT();
-  int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
   bool NoVectors = DAG.getMachineFunction().getFunction()->getAttributes().
     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
 
-  // Don't merge vectors into wider inputs.
-  if (MemVT.isVector() || !MemVT.isSimple())
-    return false;
-
   // Perform an early exit check. Do not bother looking at stored values that
   // are not constants, loads, or extracted vector elements.
   SDValue StoredVal = St->getValue();
   bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
   bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
                        isa<ConstantFPSDNode>(StoredVal);
-  bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+  bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+                             StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
    
-  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc)
+  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
+    return false;
+
+  EVT MemVT = St->getMemoryVT();
+  int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
+
+  // Don't merge vectors into wider vectors if the source data comes from loads.
+  // TODO: This restriction can be lifted by using logic similar to the
+  // ExtractVecSrc case.
+  // There's no such thing as a vector constant node; that merge case should be
+  // handled by looking through a BUILD_VECTOR source with all constant inputs.
+  if (MemVT.isVector() && IsLoadSrc)
+    return false;
+  
+  if (!MemVT.isSimple())
     return false;
+  
 
   // Only look at ends of store sequences.
   SDValue Chain = SDValue(St, 0);
@@ -10054,26 +10071,33 @@
 
   // When extracting multiple vector elements, try to store them
   // in one vector store rather than a sequence of scalar stores.
-  if (IsExtractVecEltSrc) {
-    unsigned NumElem = 0;
+  if (IsExtractVecSrc) {
+    unsigned StoresToMerge = 0;
+    bool IsVec = MemVT.isVector();
     for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
       StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      SDValue StoredVal = St->getValue();
+      unsigned StoreValOpcode = St->getValue().getOpcode();
       // This restriction could be loosened.
       // Bail out if any stored values are not elements extracted from a vector.
       // It should be possible to handle mixed sources, but load sources need
       // more careful handling (see the block of code below that handles
       // consecutive loads).
-      if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT &&
+          StoreValOpcode != ISD::EXTRACT_SUBVECTOR)
         return false;
       
       // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
-      if (TLI.isTypeLegal(Ty))
-        NumElem = i + 1;
+      unsigned Elts = i + 1;
+      if (IsVec) {
+        // When merging vector stores, get the total number of elements.
+        Elts *= MemVT.getVectorNumElements();
+      }
+      if (TLI.isTypeLegal(EVT::getVectorVT(*DAG.getContext(),
+                                           MemVT.getScalarType(), Elts)))
+        StoresToMerge = i + 1;
     }
 
-    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, StoresToMerge,
                                            false, true);
   }
 
Index: test/CodeGen/X86/MergeConsecutiveStores.ll
===================================================================
--- test/CodeGen/X86/MergeConsecutiveStores.ll
+++ test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -468,6 +468,64 @@
 ; CHECK-NEXT: retq
 }
 
+; PR21711 - Merge vector stores into wider vector stores.
+define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
+  %idx0 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %idx1 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %idx2 = getelementptr inbounds <4 x float>* %ptr, i64 5
+  %idx3 = getelementptr inbounds <4 x float>* %ptr, i64 6
+  %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
+  store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
+  store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
+  store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
+  ret void
+
+; CHECK-LABEL: merge_vec_extract_stores
+; CHECK:      vmovups %ymm0, 48(%rdi)
+; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Merging vector stores when sourced from vector loads is not currently handled.
+define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
+  %load_idx0 = getelementptr inbounds <4 x float>* %v, i64 0
+  %load_idx1 = getelementptr inbounds <4 x float>* %v, i64 1
+  %v0 = load <4 x float>* %load_idx0
+  %v1 = load <4 x float>* %load_idx1
+  %store_idx0 = getelementptr inbounds <4 x float>* %ptr, i64 0
+  %store_idx1 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  store <4 x float> %v0, <4 x float>* %store_idx0, align 16
+  store <4 x float> %v1, <4 x float>* %store_idx1, align 16
+  ret void
+
+; CHECK-LABEL: merge_vec_stores_from_loads
+; CHECK:      vmovaps
+; CHECK-NEXT: vmovaps
+; CHECK-NEXT: vmovaps
+; CHECK-NEXT: vmovaps
+; CHECK-NEXT: retq
+}
+
+; Merging vector stores when sourced from a constant vector is not currently handled. 
+define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
+  %idx0 = getelementptr inbounds <4 x i32>* %ptr, i64 3
+  %idx1 = getelementptr inbounds <4 x i32>* %ptr, i64 4
+  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
+  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
+  ret void
+
+; CHECK-LABEL: merge_vec_stores_of_constants
+; CHECK:      vxorps
+; CHECK-NEXT: vmovaps
+; CHECK-NEXT: vmovaps
+; CHECK-NEXT: retq
+}
+
 ; This is a minimized test based on real code that was failing.
 ; We could merge stores (and loads) like this...