Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12404,6 +12404,25 @@
     Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
     return (Ptr.equalBaseIndex(BasePtr));
   };
+
+  // In the case FindBetterChain gives up we will find chains of
+  // consecutive stores. Do a preprocessing step to walk up the chain
+  // leaving the the head of the chain for the parallel search.
+
+  StoreSDNode *STChain = St;
+  int64_t LastOffset = BasePtr.Offset;
+  while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
+    BaseIndexOffset Ptr;
+    //Check Chain.
+    if (!CandidateMatch(Chain, Ptr) || Ptr.Offset > LastOffset)
+      break;
+    StoreNodes.push_back(MemOpLink(STChain, LastOffset));
+    LastOffset = Ptr.Offset;
+    STChain = Chain;
+  }
+
+  SDNode *RootNode = (STChain->getChain()).getNode();
+
   // We looking for a root node which is an ancestor to all mergable
   // stores. We search up through a load, to our root and then down
   // through all children. For instance we will find Store{1,2,3} if
@@ -12415,13 +12434,11 @@
   // |-------|-------|
   // Load    Load    Store3
   // |       |
-  // Store1   Store2
+  // Store1  Store2
   //
   // FIXME: We should be able to climb and
   // descend TokenFactors to find candidates as well.
 
-  SDNode *RootNode = (St->getChain()).getNode();
-
   if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
     RootNode = Ldn->getChain().getNode();
     for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
Index: test/CodeGen/X86/stores-merging2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/stores-merging2.ll
@@ -0,0 +1,44 @@
+; RUN: llc %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+
+
+@ptr = global [32 x i32] zeroinitializer, align 4
+@strptr = global [4 x i8] zeroinitializer, align 1
+
+; FindBetterChains in SelectionDAG should give up and the 4 1-byte
+; stores will remain in a chain. We should still be able to merge them
+; into a 32-bit store.
+
+; CHECK-LABEL: @foo
+; CHECK: movl	$1684234849, strpt
+
+define i32 @foo() {
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 0)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 1)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 2)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 3)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 4)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 5)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 6)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 7)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 8)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 9)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 10)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 11)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 12)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 13)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 14)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 15)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 16)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 17)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 18)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 19)
+  store i8 97, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 0)
+  store i8 98, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 1)
+  store i8 99, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 2)
+  store i8 100, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 3)
+  ret i32 0
+}