Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2105,27 +2105,60 @@ LoadNode->getOffset() != StoreNode->getOffset()) return false; - // Check if the chain is produced by the load or is a TokenFactor with - // the load output chain as an operand. Return InputChain by reference. + bool FoundLoad = false; + SmallVector ChainOps; + SmallVector LoopWorklist; + SmallPtrSet Visited; + const unsigned int Max = 1024; + + // Visualization of Load-Op-Store fusion: + // ------------------------- + // Legend: + // *-lines = Chain operand dependencies. + // |-lines = Normal operand dependencies. + // Dependencies flow down and right. n-suffix references multiple nodes. + // + // C Xn C + // * * * + // * * * + // Xn A-LD Yn TF Yn + // * * \ | * | + // * * \ | * | + // * * \ | => A--LD_OP_ST + // * * \| \ + // TF OP \ + // * | \ Zn + // * | \ + // A-ST Zn + // + + // This merge induced dependences from: #1: Xn -> LD, OP, Zn + // #2: Yn -> LD + // #3: ST -> Zn + // Check for the dual dependencies to make sure we do not induce a loop. + + // As LD is a predecessor to both OP and ST we can do this by checking: + // a). if LD is a predecessor to a memeber of Xn + Yn. + // b). if a Zn is a predecessor to ST (i.e. a chain successor to a value + // successor of Op) + + // If a node N1 is a predecessor to N2 then it must also be a predecessor (or + // equal) to at least 1 operand of N2. Using this (and that Zn cannot be TF) + // we can rephrase (b) as: + + // b'). if a Zn is a predecessor (or equal to) A + Xn + LD + OP + + // As LD, Op, A are predecessors to Zn this is equivalent to: + + // b''). Zn is a predecessor (or equal to) to Xn. + SDValue Chain = StoreNode->getChain(); + // Gather X elements in ChainOps. if (Chain == Load.getValue(1)) { - InputChain = LoadNode->getChain(); - return true; - } - - if (Chain.getOpcode() == ISD::TokenFactor) { - // Fusing Load-Op-Store requires predecessors of store must also - // be predecessors of the load. This addition may cause a loop. We - // can check this by doing a search for Load in the new - // dependencies. As this can be expensive, heuristically prune - // this search by visiting the uses and make sure they all have - // smaller node id than the load. - - bool FoundLoad = false; - SmallVector ChainOps; - SmallVector LoopWorklist; - SmallPtrSet Visited; + FoundLoad = true; + ChainOps.push_back(Load.getOperand(0)); + } else if (Chain.getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { @@ -2137,31 +2170,38 @@ LoopWorklist.push_back(Op.getNode()); ChainOps.push_back(Op); } + } - if (!FoundLoad) - return false; + if (!FoundLoad) + return false; - // If Loop Worklist is not empty. Check if we would make a loop. - if (!LoopWorklist.empty()) { - const unsigned int Max = 8192; - // if Load is predecessor to potentially loop inducing chain - // dependencies. - if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, - Max, true)) - return false; - // Fail conservatively if we ended loop search early. - if (Visited.size() >= Max) - return false; - } + // Worklist is currently Xn. Add Yn to worklist. + for (SDValue Op : StoredVal->ops()) + if (Op.getNode() != LoadNode) + LoopWorklist.push_back(Op.getNode()); - // Make a new TokenFactor with all the other input chains except - // for the load. - InputChain = - CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); - return true; + // Check (a) if Load is a predecessor to Xn + Yn + if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, + true)) + return false; + + // Check (b''). As Yn cannot be a predecessor to Zn, it is safe to reuse + // the search history (Visited). As we check equality as well, we must first + // add Xn to the Visited list. We add elements of ChainOps which also has C. + // This is safe as C is already a predecessor to all elements of Zn. + + for (SDValue Op : ChainOps) + Visited.insert(Op.getNode()); + + for (auto *Z : StoredVal->uses()) + if (Z != StoreNode && + SDNode::hasPredecessorHelper(Z, Visited, LoopWorklist, Max, true)) + return false; + + InputChain = + CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); + return true; } - return false; -} // Change a chain of {load; op; store} of the same value into a simple op // through memory of that value, if the uses of the modified value and its Index: llvm/test/CodeGen/X86/pr36274.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/pr36274.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s + +@x = external local_unnamed_addr global i64, align 8 + +define void @pr36274() { +; CHECK-LABEL: pr36274: +; CHECK: # %bb.0: +; CHECK-NEXT: movl x+4, %eax +; CHECK-NEXT: addl $1, x +; CHECK-NEXT: adcl $0, %eax +; CHECK-NEXT: movl %eax, x+4 +; CHECK-NEXT: retl + %x = load volatile i64, i64* @x, align 8 + %x1 = add i64 %x, 1 + %vx1 = bitcast i64 %x1 to <2 x i32> + %vx1_0 = extractelement <2 x i32> %vx1, i32 0 + %vx1_1 = extractelement <2 x i32> %vx1, i32 1 + %p32x = bitcast i64* @x to <2 x i32>* + %a32_0 = getelementptr <2 x i32>, <2 x i32>* %p32x, i32 0, i32 0 + store volatile i32 %vx1_0, i32* %a32_0, align 8 + %a32_1 = getelementptr <2 x i32>, <2 x i32>* %p32x, i32 0, i32 1 + store volatile i32 %vx1_1, i32* %a32_1, align 4 + ret void +}