Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -449,6 +449,11 @@
                           EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
                           bool &NarrowLoad);
 
+    /// Helper function for MergeConsecutiveStores. Merges the chains of the
+    /// component stores into a single TokenFactor.
+    SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+                                unsigned NumStores);
+
     /// This is a helper function for MergeConsecutiveStores. When the source
     /// elements of the consecutive stores are all constants or all extracted
     /// vector elements, try to merge them into one larger store.
@@ -12071,6 +12076,28 @@
   return false;
 }
 
+SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+                                         unsigned NumStores) {
+  SmallVector<SDValue, 8> Chains;
+  SmallPtrSet<const SDNode *, 8> Visited;
+  SDLoc StoreDL(StoreNodes[0].MemNode);
+
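+  // Record every store in the merge set so that chains produced by other
+  // members can be recognized and skipped below.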
+  for (unsigned i = 0; i < NumStores; ++i) {
+    Visited.insert(StoreNodes[i].MemNode);
+  }
+
+  // Don't include chains that are produced by other stores in the merge set.
+  for (unsigned i = 0; i < NumStores; ++i) {
+    if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0)
+      Chains.push_back(StoreNodes[i].MemNode->getChain());
+  }
+
+  assert(!Chains.empty() && "Expected at least one chain to merge");
+  return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains);
+}
+
 bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
                   SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
                   unsigned NumStores, bool IsConstantSrc, bool UseVector) {
@@ -12153,17 +12178,8 @@
     StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
   }
 
-  SmallVector<SDValue, 8> Chains;
-
-  // Gather all Chains we're inheriting. As generally all chains are
-  // equal, do minor check to remove obvious redundancies.
-  Chains.push_back(StoreNodes[0].MemNode->getChain());
-  for (unsigned i = 1; i < NumStores; ++i)
-    if (StoreNodes[0].MemNode->getChain() != StoreNodes[i].MemNode->getChain())
-      Chains.push_back(StoreNodes[i].MemNode->getChain());
-
   LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
-  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);
   SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal,
                                   FirstInChain->getBasePtr(),
                                   FirstInChain->getPointerInfo(),
@@ -12192,6 +12208,57 @@
   if (BasePtr.Base.isUndef())
     return;
 
+  bool IsLoadSrc = isa<LoadSDNode>(St->getValue());
+  bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) ||
+                       isa<ConstantFPSDNode>(St->getValue());
+  bool IsExtractVecSrc =
+      (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
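+  // A candidate store must be a simple store to the same base address with a
+  // compatible value kind and memory type; on a match, Ptr is set to the
+  // candidate's decomposed address.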
+  auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr) -> bool {
+    if (Other->isVolatile() || Other->isIndexed())
+      return false;
+    // We can merge constant floats to equivalent integers
+    if (Other->getMemoryVT() != MemVT)
+      if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) &&
+            isa<ConstantFPSDNode>(Other->getValue())))
+        return false;
+    Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
+    if (!Ptr.equalBaseIndex(BasePtr))
+      return false;
+    if (IsLoadSrc)
+      return isa<LoadSDNode>(Other->getValue());
+    if (IsConstantSrc)
+      return (MemVT.isInteger() || MemVT == Other->getMemoryVT()) &&
+             (isa<ConstantSDNode>(Other->getValue()) ||
+              isa<ConstantFPSDNode>(Other->getValue()));
+    if (IsExtractVecSrc)
+      return (MemVT == Other->getMemoryVT() &&
+              (Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+               Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR));
+    return false;
+  };
+
+  // If FindBetterChain gave up, consecutive stores remain chained together.
+  // Do a preprocessing step that walks up the chain collecting candidates,
+  // leaving the head of the chain as the root for the parallel search below.
+
+  StoreSDNode *STChain = St;
+  int64_t LastOffset = BasePtr.Offset;
+  while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
+    BaseIndexOffset Ptr;
+    // Stop when the next store up is not a candidate or is at a higher offset.
+    if (!CandidateMatch(Chain, Ptr) || Ptr.Offset > LastOffset)
+      break;
+    StoreNodes.push_back(MemOpLink(STChain, LastOffset));
+    LastOffset = Ptr.Offset;
+    STChain = Chain;
+  }
+
+  SDNode *RootNode = (STChain->getChain()).getNode();
+
   // We looking for a root node which is an ancestor to all mergable
   // stores. We search up through a load, to our root and then down
   // through all children. For instance we will find Store{1,2,3} if
@@ -12203,13 +12267,11 @@
   // |-------|-------|
   // Load    Load    Store3
   // |       |
-  // Store1   Store2
+  // Store1  Store2
   //
   // FIXME: We should be able to climb and
   // descend TokenFactors to find candidates as well.
 
-  SDNode *RootNode = (St->getChain()).getNode();
-
   // Set of Parents of Candidates
   std::set<SDNode *> CandidateParents;
 
@@ -12221,39 +12283,13 @@
   } else
     CandidateParents.insert(RootNode);
 
-  bool IsLoadSrc = isa<LoadSDNode>(St->getValue());
-  bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) ||
-                       isa<ConstantFPSDNode>(St->getValue());
-  bool IsExtractVecSrc =
-      (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
-       St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
-  auto CorrectValueKind = [&](StoreSDNode *Other) -> bool {
-    if (IsLoadSrc)
-      return isa<LoadSDNode>(Other->getValue());
-    if (IsConstantSrc)
-      return (isa<ConstantSDNode>(Other->getValue()) ||
-              isa<ConstantFPSDNode>(Other->getValue()));
-    if (IsExtractVecSrc)
-      return (Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
-              Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
-    return false;
-  };
-
   // check all parents of mergable children
   for (auto P = CandidateParents.begin(); P != CandidateParents.end(); ++P)
     for (auto I = (*P)->use_begin(), E = (*P)->use_end(); I != E; ++I)
       if (I.getOperandNo() == 0)
         if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
-          if (OtherST->isVolatile() || OtherST->isIndexed())
-            continue;
-          // We can merge constant floats to equivalent integers
-          if (OtherST->getMemoryVT() != MemVT)
-            if (!(MemVT.isInteger() && MemVT.bitsEq(OtherST->getMemoryVT()) &&
-                  isa<ConstantFPSDNode>(OtherST->getValue())))
-              continue;
-          BaseIndexOffset Ptr =
-              BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
-          if (Ptr.equalBaseIndex(BasePtr) && CorrectValueKind(OtherST))
+          BaseIndexOffset Ptr;
+          if (CandidateMatch(OtherST, Ptr))
             StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
         }
 }
@@ -12494,9 +12530,5 @@
     LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
     if (!Ld) break;
 
-    // Loads must only have one use.
-    if (!Ld->hasNUsesOfValue(1, 0))
-      break;
-
     // The memory operands must not be volatile.
     if (Ld->isVolatile() || Ld->isIndexed())
@@ -12506,9 +12539,5 @@
     if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
       break;
 
-    // The stored memory type must be the same.
-    if (Ld->getMemoryVT() != MemVT)
-      break;
-
     BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
     // If this is not the first ptr that we check.
@@ -12611,14 +12641,6 @@
   if (NumElem < 2)
     return false;
 
-  // Collect the chains from all merged stores. Because the common case
-  // all chains are the same, check if we match the first Chain.
-  SmallVector<SDValue, 8> MergeStoreChains;
-  MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain());
-  for (unsigned i = 1; i < NumElem; ++i)
-    if (StoreNodes[0].MemNode->getChain() != StoreNodes[i].MemNode->getChain())
-      MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain());
-
   // Find if it is better to use vectors or integers to load and store
   // to memory.
   EVT JointMemOpVT;
@@ -12638,8 +12660,7 @@
                                 FirstLoad->getBasePtr(),
                                 FirstLoad->getPointerInfo(), FirstLoadAlign);
 
-  SDValue NewStoreChain =
-    DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains);
+  SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
 
   AddToWorklist(NewStoreChain.getNode());
 
@@ -12650,8 +12671,24 @@
   // Transfer chain users from old loads to the new load.
   for (unsigned i = 0; i < NumElem; ++i) {
     LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
-    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
-                                  SDValue(NewLoad.getNode(), 1));
+    if (SDValue(Ld, 0).hasOneUse()) {
+      // The merged store was the value's only user; just replace the chain.
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+                                    SDValue(NewLoad.getNode(), 1));
+    } else {
+      // The value has other users; join the old and new load chains.
+      SDValue Token0 =
+          DAG.getNode(ISD::TokenFactor, SDLoc(Ld), MVT::Other, SDValue(Ld, 1),
+                      SDValue(NewLoad.getNode(), 1));
+      // Don't clean up Ld yet; this turns Token0's first operand into itself.
+      CombineTo(Ld, SDValue(Ld, 0), Token0, false);
+      SDValue Token =
+          DAG.getNode(ISD::TokenFactor, SDLoc(Ld), MVT::Other, SDValue(Ld, 1),
+                      SDValue(NewLoad.getNode(), 1));
+      // Reset Token0's input from itself to Ld's output chain.
+      CombineTo(Token0.getNode(), Token);
+      AddToWorklist(Ld);
+    }
   }
 
   // Replace the all stores with the new store.
Index: test/CodeGen/X86/merge_store_duplicated_loads.ll
===================================================================
--- test/CodeGen/X86/merge_store_duplicated_loads.ll
+++ test/CodeGen/X86/merge_store_duplicated_loads.ll
@@ -1,18 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -o - | FileCheck %s
 
-
+; PR32086
 target triple = "x86_64-unknown-linux-gnu"
 
 define void @merge_double(double* noalias nocapture %st, double* noalias nocapture readonly %ld) #0 {
 ; CHECK-LABEL: merge_double:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT:    movsd %xmm0, (%rdi)
-; CHECK-NEXT:    movsd %xmm1, 8(%rdi)
-; CHECK-NEXT:    movsd %xmm0, 16(%rdi)
-; CHECK-NEXT:    movsd %xmm1, 24(%rdi)
+; CHECK-NEXT:    movups (%rsi), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%rdi)
+; CHECK-NEXT:    movups %xmm0, 16(%rdi)
 ; CHECK-NEXT:    retq
   %ld_idx1 = getelementptr inbounds double, double* %ld, i64 1
   %ld0 = load double, double* %ld, align 8, !tbaa !2
@@ -32,12 +30,9 @@
 define void @merge_loadstore_int(i64* noalias nocapture readonly %p, i64* noalias nocapture %q) local_unnamed_addr #0 {
 ; CHECK-LABEL: merge_loadstore_int:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq 8(%rdi), %rcx
-; CHECK-NEXT:    movq %rax, (%rsi)
-; CHECK-NEXT:    movq %rcx, 8(%rsi)
-; CHECK-NEXT:    movq %rax, 16(%rsi)
-; CHECK-NEXT:    movq %rcx, 24(%rsi)
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%rsi)
+; CHECK-NEXT:    movups %xmm0, 16(%rsi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = load i64, i64* %p, align 8, !tbaa !1
@@ -57,11 +52,9 @@
 ; CHECK-LABEL: merge_loadstore_int_with_extra_use:
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq 8(%rdi), %rcx
-; CHECK-NEXT:    movq %rax, (%rsi)
-; CHECK-NEXT:    movq %rcx, 8(%rsi)
-; CHECK-NEXT:    movq %rax, 16(%rsi)
-; CHECK-NEXT:    movq %rcx, 24(%rsi)
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%rsi)
+; CHECK-NEXT:    movups %xmm0, 16(%rsi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = load i64, i64* %p, align 8, !tbaa !1
@@ -75,7 +68,6 @@
   %arrayidx5 = getelementptr inbounds i64, i64* %q, i64 3
   store i64 %1, i64* %arrayidx5, align 8, !tbaa !1
   ret i64 %0
-
 }
 
 attributes #0 = { "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
Index: test/CodeGen/X86/stores-merging2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/stores-merging2.ll
@@ -0,0 +1,45 @@
+; RUN: llc %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+
+
+@ptr = global [32 x i32] zeroinitializer, align 4
+@strptr = global [4 x i8] zeroinitializer, align 1
+
+; FindBetterChain in SelectionDAG should give up, leaving the four 1-byte
+; stores chained to one another. We should still be able to merge them
+; into a single 32-bit store.
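+; The value 1684234849 is 0x64636261, i.e. the bytes "abcd" in little-endian order.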
+
+; CHECK-LABEL: foo:
+; CHECK: movl $1684234849, strptr
+
+define i32 @foo() {
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 0)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 1)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 2)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 3)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 4)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 5)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 6)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 7)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 8)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 9)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 10)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 11)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 12)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 13)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 14)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 15)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 16)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 17)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 18)
+  store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 19)
+  store i8 97, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 0)
+  store i8 98, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 1)
+  store i8 99, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 2)
+  store i8 100, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 3)
+  ret i32 0
+}