Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2100,27 +2100,46 @@ LoadNode->getOffset() != StoreNode->getOffset()) return false; - // Check if the chain is produced by the load or is a TokenFactor with - // the load output chain as an operand. Return InputChain by reference. - SDValue Chain = StoreNode->getChain(); - - if (Chain == Load.getValue(1)) { - InputChain = LoadNode->getChain(); - return true; - } - - if (Chain.getOpcode() == ISD::TokenFactor) { - // Fusing Load-Op-Store requires predecessors of store must also - // be predecessors of the load. This addition may cause a loop. We - // can check this by doing a search for Load in the new - // dependencies. As this can be expensive, heuristically prune - // this search by visiting the uses and make sure they all have - // smaller node id than the load. - bool FoundLoad = false; SmallVector<SDValue, 4> ChainOps; SmallVector<const SDNode *, 4> LoopWorklist; SmallPtrSet<const SDNode *, 16> Visited; + const unsigned int Max = 1024; + + // Visualization of Merge: + // ------------------------- + // Legend: + // *-lines = chain dependencies. + // |-lines = Normal dependencies. + // Dependencies flow down and right. + // + // C X C + // * * * + // * * * + // X A-LD Y1,Y2,... TF Y1,Y2,... + // * * \ | * | + // * * \ | * | + // * * \ | => A--LD_OP_ST + // * * \| \ + // TF OP \ + // * | \ Z1,Z2,... + // * | \ + // A-ST Z1,Z2,... + // + + // For a cycle to happen one of the inputs of the merged nodes must + // be a successor to another merged node. As the merged nodes are a + // chain this reduces to showing that X, A, C, Y1,... are not + // successors of LD. As A and C are structurally impossible we can just + // check X and Y. + + SDValue Chain = StoreNode->getChain(); + + // Gather X elements in ChainOps. 
+ if (Chain == Load.getValue(1)) { + FoundLoad = true; + ChainOps.push_back(Load.getOperand(0)); + } else if (Chain.getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { @@ -2132,28 +2151,23 @@ LoopWorklist.push_back(Op.getNode()); ChainOps.push_back(Op); } + } - if (!FoundLoad) - return false; + if (!FoundLoad) + return false; + // Add Y1, ... + for (SDValue Op : StoredVal->ops()) + if (Op.getNode() != LoadNode) + LoopWorklist.push_back(Op.getNode()); - // If Loop Worklist is not empty. Check if we would make a loop. - if (!LoopWorklist.empty()) { - const unsigned int Max = 8192; - // if Load is predecessor to potentially loop inducing chain - // dependencies. - if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, - Max, true)) - return false; - } + if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, + true)) + return false; - // Make a new TokenFactor with all the other input chains except - // for the load. 
- InputChain = - CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); - return true; + InputChain = + CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); + return true; } - return false; -} // Change a chain of {load; op; store} of the same value into a simple op // through memory of that value, if the uses of the modified value and its Index: llvm/test/CodeGen/X86/pr36274.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/pr36274.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s + +@g = external local_unnamed_addr global i32, align 4 +@a = external local_unnamed_addr global [13 x [2 x i64]], align 8 + +define void @main() local_unnamed_addr { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addl $1, a +; CHECK-NEXT: adcl $0, a+4 +; CHECK-NEXT: addl $1, a+16 +; CHECK-NEXT: adcl $0, a+20 +; CHECK-NEXT: addl $1, a+32 +; CHECK-NEXT: adcl $0, a+36 +; CHECK-NEXT: addl $1, a+48 +; CHECK-NEXT: adcl $0, a+52 +; CHECK-NEXT: addl $1, a+64 +; CHECK-NEXT: adcl $0, a+68 +; CHECK-NEXT: addl $1, a+80 +; CHECK-NEXT: adcl $0, a+84 +; CHECK-NEXT: addl $1, a+96 +; CHECK-NEXT: adcl $0, a+100 +; CHECK-NEXT: addl $1, a+112 +; CHECK-NEXT: adcl $0, a+116 +; CHECK-NEXT: addl $1, a+128 +; CHECK-NEXT: adcl $0, a+132 +; CHECK-NEXT: addl $1, a+144 +; CHECK-NEXT: adcl $0, a+148 +; CHECK-NEXT: addl $1, a+160 +; CHECK-NEXT: adcl $0, a+164 +; CHECK-NEXT: addl $1, a+176 +; CHECK-NEXT: adcl $0, a+180 +; CHECK-NEXT: movl a+196, %eax +; CHECK-NEXT: addl $1, a+192 +; CHECK-NEXT: adcl $0, %eax +; CHECK-NEXT: movl %eax, a+196 +; CHECK-NEXT: retl +entry: + %0 = load i32, i32* @g, align 4 + %1 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 0, i32 0), align 8 + %inc = add i64 %1, 1 + store i64 %inc, i64* getelementptr inbounds ([13 x [2 x 
i64]], [13 x [2 x i64]]* @a, i32 0, i32 0, i32 0), align 8 + %2 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 1, i32 0), align 8 + %inc.1 = add i64 %2, 1 + store i64 %inc.1, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 1, i32 0), align 8 + %3 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 2, i32 0), align 8 + %inc.2 = add i64 %3, 1 + store i64 %inc.2, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 2, i32 0), align 8 + %4 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 3, i32 0), align 8 + %inc.3 = add i64 %4, 1 + store i64 %inc.3, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 3, i32 0), align 8 + %5 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 4, i32 0), align 8 + %inc.4 = add i64 %5, 1 + store i64 %inc.4, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 4, i32 0), align 8 + %6 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 5, i32 0), align 8 + %inc.5 = add i64 %6, 1 + store i64 %inc.5, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 5, i32 0), align 8 + %7 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 6, i32 0), align 8 + %inc.6 = add i64 %7, 1 + store i64 %inc.6, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 6, i32 0), align 8 + %8 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 7, i32 0), align 8 + %inc.7 = add i64 %8, 1 + store i64 %inc.7, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 7, i32 0), align 8 + %9 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 8, i32 0), align 8 + 
%inc.8 = add i64 %9, 1 + store i64 %inc.8, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 8, i32 0), align 8 + %10 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 9, i32 0), align 8 + %inc.9 = add i64 %10, 1 + store i64 %inc.9, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 9, i32 0), align 8 + %11 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 10, i32 0), align 8 + %inc.10 = add i64 %11, 1 + store i64 %inc.10, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 10, i32 0), align 8 + %12 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 11, i32 0), align 8 + %inc.11 = add i64 %12, 1 + store i64 %inc.11, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 11, i32 0), align 8 + %13 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 12, i32 0), align 8 + %inc.12 = add i64 %13, 1 + store i64 %inc.12, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 12, i32 0), align 8 + ret void + }