Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2105,27 +2105,56 @@ LoadNode->getOffset() != StoreNode->getOffset()) return false; - // Check if the chain is produced by the load or is a TokenFactor with - // the load output chain as an operand. Return InputChain by reference. + bool FoundLoad = false; + SmallVector ChainOps; + SmallVector LoopWorklist; + SmallPtrSet Visited; + const unsigned int Max = 1024; + + // Visualization of Load-Op-Store fusion: + // ------------------------- + // Legend: + // *-lines = Chain operand dependencies. + // |-lines = Normal operand dependencies. + // Dependencies flow down and right. n-suffix references multiple nodes. + // + // C Xn C + // * * * + // * * * + // Xn A-LD Yn TF Yn + // * * \ | * | + // * * \ | * | + // * * \ | => A--LD_OP_ST + // * * \| \ + // TF OP \ + // * | \ Zn + // * | \ + // A-ST Zn + // + + // This merge induced dependences from: #1: Xn -> LD, OP, Zn + // #2: Yn -> LD + // #3: ST -> Zn + + // Ensure the transform is safe by checking for the dual + // dependencies to make sure we do not induce a loop. + + // As LD is a predecessor to both OP and ST we can do this by checking: + // a). if LD is a predecessor to a member of Xn or Yn. + // b). if a Zn is a predecessor to ST. + + // However, (b) can only occur through being a chain predecessor to + // ST, which is the same as Zn being a member or predecessor of Xn, + // which is a subset of LD being a predecessor of Xn. So it's + // subsumed by check (a). + SDValue Chain = StoreNode->getChain(); + // Gather X elements in ChainOps. if (Chain == Load.getValue(1)) { - InputChain = LoadNode->getChain(); - return true; - } - - if (Chain.getOpcode() == ISD::TokenFactor) { - // Fusing Load-Op-Store requires predecessors of store must also - // be predecessors of the load. This addition may cause a loop. We - // can check this by doing a search for Load in the new - // dependencies. As this can be expensive, heuristically prune - // this search by visiting the uses and make sure they all have - // smaller node id than the load. - - bool FoundLoad = false; - SmallVector ChainOps; - SmallVector LoopWorklist; - SmallPtrSet Visited; + FoundLoad = true; + ChainOps.push_back(Load.getOperand(0)); + } else if (Chain.getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { @@ -2137,31 +2166,25 @@ LoopWorklist.push_back(Op.getNode()); ChainOps.push_back(Op); } + } - if (!FoundLoad) - return false; + if (!FoundLoad) + return false; - // If Loop Worklist is not empty. Check if we would make a loop. - if (!LoopWorklist.empty()) { - const unsigned int Max = 8192; - // if Load is predecessor to potentially loop inducing chain - // dependencies. - if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, - Max, true)) - return false; - // Fail conservatively if we ended loop search early. - if (Visited.size() >= Max) - return false; - } + // Worklist is currently Xn. Add Yn to worklist. + for (SDValue Op : StoredVal->ops()) + if (Op.getNode() != LoadNode) + LoopWorklist.push_back(Op.getNode()); - // Make a new TokenFactor with all the other input chains except - // for the load. - InputChain = - CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); - return true; + // Check (a) if Load is a predecessor to Xn + Yn + if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, + true)) + return false; + + InputChain = + CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); + return true; } - return false; -} // Change a chain of {load; op; store} of the same value into a simple op // through memory of that value, if the uses of the modified value and its Index: llvm/test/CodeGen/X86/pr36274.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/pr36274.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s + +; This tests is checking for a case where the x86 load-op-store fusion +; misses a dependence between the fused load and a non-fused operand +; to the load causing a cycle. Here the dependence in question comes +; from the carry in input of the adcl. + +@vx = external local_unnamed_addr global <2 x i32>, align 8 + +define void @pr36274(i32* %somewhere) { +; CHECK-LABEL: pr36274: +; CHECK: # %bb.0: +; CHECK-NEXT: movl vx+4, %eax +; CHECK-NEXT: addl $1, vx +; CHECK-NEXT: adcl $0, %eax +; CHECK-NEXT: movl %eax, vx+4 +; CHECK-NEXT: retl + %a0 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 0 + %a1 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 1 + %x1 = load volatile i32, i32* %a1, align 4 + %x0 = load volatile i32, i32* %a0, align 8 + %vx0 = insertelement <2 x i32> undef, i32 %x0, i32 0 + %vx1 = insertelement <2 x i32> %vx0, i32 %x1, i32 1 + %x = bitcast <2 x i32> %vx1 to i64 + %add = add i64 %x, 1 + %vadd = bitcast i64 %add to <2 x i32> + %vx1_0 = extractelement <2 x i32> %vadd, i32 0 + %vx1_1 = extractelement <2 x i32> %vadd, i32 1 + store i32 %vx1_0, i32* %a0, align 8 + store i32 %vx1_1, i32* %a1, align 4 + ret void +}