Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2100,27 +2100,54 @@
       LoadNode->getOffset() != StoreNode->getOffset())
     return false;
 
-  // Check if the chain is produced by the load or is a TokenFactor with
-  // the load output chain as an operand. Return InputChain by reference.
-  SDValue Chain = StoreNode->getChain();
-
-  if (Chain == Load.getValue(1)) {
-    InputChain = LoadNode->getChain();
-    return true;
-  }
-
-  if (Chain.getOpcode() == ISD::TokenFactor) {
-    // Fusing Load-Op-Store requires predecessors of store must also
-    // be predecessors of the load. This addition may cause a loop. We
-    // can check this by doing a search for Load in the new
-    // dependencies. As this can be expensive, heuristically prune
-    // this search by visiting the uses and make sure they all have
-    // smaller node id than the load.
-    bool FoundLoad = false;
+  bool FoundLoad = false;
   SmallVector<SDValue, 4> ChainOps;
   SmallVector<const SDNode *, 4> LoopWorklist;
   SmallPtrSet<const SDNode *, 16> Visited;
+  const unsigned int Max = 1024;
+
+  //  Visualization of Merge:
+  //  -------------------------
+  //  Legend:
+  //    *-lines = Chain dependencies.
+  //    |-lines = Normal dependencies.
+  //    Dependencies flow down and right.
+  //
+  //       C                            X    C
+  //       *                            *    *
+  //       *                            *    *
+  //       X    A-LD    Y1,Y2,...       TF   Y1,Y2,...
+  //       *    * \       |             *     |
+  //       *    *  \      |             *     |
+  //       *    *   \     |       =>    A--LD_OP_ST
+  //       *    *    \|                      \
+  //       TF    OP                           \
+  //       *      | \                        Z1,Z2,...
+  //       *      |  \
+  //       A-ST       Z1,Z2,...
+  //
+
+  // This merge induces dependences from: #1: X -> LD, OP, Z1,Z2...
+  //                                      #2: Y1,... -> LD
+  //                                      #3: ST -> Z1,Z2...
+  // Check for the dual dependencies to make sure we do not induce a loop.
+
+  // As LD is a predecessor to both OP and ST we can do this by checking:
+  //  * a. if X, Yn are successors of LD
+  //  * b. if Zn is a successor of ST.
+
+  // We can further simplify b by noting that, for ST to be a predecessor
+  // of Zn, one of X, TF, LD, OP, or Yn must be a predecessor of (or equal
+  // to) Zn. By construction of the DAG we can reduce this to X being
+  // equal to, or a predecessor of, Zn.
+
+  SDValue Chain = StoreNode->getChain();
+
+  // Gather X elements in ChainOps.
+  if (Chain == Load.getValue(1)) {
+    FoundLoad = true;
+    ChainOps.push_back(Load.getOperand(0));
+  } else if (Chain.getOpcode() == ISD::TokenFactor) {
     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
       SDValue Op = Chain.getOperand(i);
       if (Op == Load.getValue(1)) {
@@ -2132,28 +2159,39 @@
       LoopWorklist.push_back(Op.getNode());
       ChainOps.push_back(Op);
     }
+  }
+
+  if (!FoundLoad)
+    return false;
+
+  // Add Yn to worklist.
+  for (SDValue Op : StoredVal->ops())
+    if (Op.getNode() != LoadNode)
+      LoopWorklist.push_back(Op.getNode());
+
+  // Check if Load is a predecessor to X + Yn.
+  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
+                                   true))
+    return false;
+
+  // Check if Zn is a predecessor of (or equal to) X. As C and Yn cannot
+  // be predecessors to ST we can just add all nodes in ChainOps to
+  // Visited and share the search histories.
-    if (!FoundLoad)
+  for (SDValue Op : ChainOps)
+    Visited.insert(Op.getNode());
+
+  // Check if ST is a predecessor of X (+C+Yn).
+  for (auto *Z : StoredVal->uses())
+    if (Z != StoreNode &&
+        SDNode::hasPredecessorHelper(Z, Visited, LoopWorklist, Max, true))
       return false;
-    // If Loop Worklist is not empty. Check if we would make a loop.
-    if (!LoopWorklist.empty()) {
-      const unsigned int Max = 8192;
-      // if Load is predecessor to potentially loop inducing chain
-      // dependencies.
-      if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist,
-                                       Max, true))
-        return false;
-    }
-    // Make a new TokenFactor with all the other input chains except
-    // for the load.
-    InputChain =
-        CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
-    return true;
+  InputChain =
+      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
+  return true;
 }
-  return false;
-}
 
 // Change a chain of {load; op; store} of the same value into a simple op
 // through memory of that value, if the uses of the modified value and its
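What the new code implements is, in essence, a bounded reverse-reachability search over the DAG. Before LD, OP and ST are merged into a single load-op-store node, we have to show that neither the extra chain inputs X nor OP's other operands Yn can reach LD through operand edges, and that no other user Zn of OP's value depends on ST; either situation would make the merged node a predecessor of itself. The standalone sketch below illustrates that shape of check, including the conservative bail-out when the search exceeds a step budget (the analogue of the Max = 1024 cap above). It is only an illustration: Node, isPredecessor and the example graph are invented here and are not the SelectionDAG API.

// toy_fuse_check.cpp - standalone illustration only; this is not LLVM code,
// just the shape of the cycle check described in the comments above.
#include <cstdio>
#include <unordered_set>
#include <vector>

struct Node {
  const char *Name;
  std::vector<Node *> Operands; // edges from a node to the values it reads
};

// Return true if Target is reachable from any node in Worklist by walking
// operand (predecessor) edges. Gives up and conservatively answers "true"
// once MaxSteps nodes have been visited.
static bool isPredecessor(const Node *Target,
                          std::vector<const Node *> Worklist,
                          unsigned MaxSteps = 1024) {
  std::unordered_set<const Node *> Visited;
  while (!Worklist.empty()) {
    const Node *N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue;
    if (N == Target)
      return true;
    if (Visited.size() >= MaxSteps)
      return true; // search too large: refuse the fusion rather than risk a cycle
    for (Node *Op : N->Operands)
      Worklist.push_back(Op);
  }
  return false;
}

int main() {
  // LD feeds OP feeds ST, and OP has a second operand Y that also uses LD's
  // value. Folding LD/OP/ST into one node would make that node its own
  // predecessor, so the check must reject the fusion.
  Node LD{"LD", {}};
  Node Y{"Y", {&LD}};
  Node OP{"OP", {&LD, &Y}};
  Node ST{"ST", {&OP}};
  (void)ST;
  std::printf("fusion %s\n", isPredecessor(&LD, {&Y}) ? "rejected" : "allowed");
  return 0;
}

The conservative answer on budget exhaustion matters: giving up only means a fusion is skipped, whereas missing a real path would create a cyclic DAG.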
Index: llvm/test/CodeGen/X86/pr36274.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/pr36274.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
+
+@g = external local_unnamed_addr global i32, align 4
+@a = external local_unnamed_addr global [13 x [2 x i64]], align 8
+
+define void @main() local_unnamed_addr {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addl $1, a
+; CHECK-NEXT:    adcl $0, a+4
+; CHECK-NEXT:    addl $1, a+16
+; CHECK-NEXT:    adcl $0, a+20
+; CHECK-NEXT:    addl $1, a+32
+; CHECK-NEXT:    adcl $0, a+36
+; CHECK-NEXT:    addl $1, a+48
+; CHECK-NEXT:    adcl $0, a+52
+; CHECK-NEXT:    addl $1, a+64
+; CHECK-NEXT:    adcl $0, a+68
+; CHECK-NEXT:    addl $1, a+80
+; CHECK-NEXT:    adcl $0, a+84
+; CHECK-NEXT:    addl $1, a+96
+; CHECK-NEXT:    adcl $0, a+100
+; CHECK-NEXT:    addl $1, a+112
+; CHECK-NEXT:    adcl $0, a+116
+; CHECK-NEXT:    addl $1, a+128
+; CHECK-NEXT:    adcl $0, a+132
+; CHECK-NEXT:    addl $1, a+144
+; CHECK-NEXT:    adcl $0, a+148
+; CHECK-NEXT:    addl $1, a+160
+; CHECK-NEXT:    adcl $0, a+164
+; CHECK-NEXT:    addl $1, a+176
+; CHECK-NEXT:    adcl $0, a+180
+; CHECK-NEXT:    movl a+196, %eax
+; CHECK-NEXT:    addl $1, a+192
+; CHECK-NEXT:    adcl $0, %eax
+; CHECK-NEXT:    movl %eax, a+196
+; CHECK-NEXT:    retl
+entry:
+  %0 = load i32, i32* @g, align 4
+  %1 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 0, i32 0), align 8
+  %inc = add i64 %1, 1
+  store i64 %inc, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 0, i32 0), align 8
+  %2 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 1, i32 0), align 8
+  %inc.1 = add i64 %2, 1
+  store i64 %inc.1, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 1, i32 0), align 8
+  %3 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 2, i32 0), align 8
+  %inc.2 = add i64 %3, 1
+  store i64 %inc.2, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 2, i32 0), align 8
+  %4 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 3, i32 0), align 8
+  %inc.3 = add i64 %4, 1
+  store i64 %inc.3, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 3, i32 0), align 8
+  %5 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 4, i32 0), align 8
+  %inc.4 = add i64 %5, 1
+  store i64 %inc.4, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 4, i32 0), align 8
+  %6 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 5, i32 0), align 8
+  %inc.5 = add i64 %6, 1
+  store i64 %inc.5, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 5, i32 0), align 8
+  %7 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 6, i32 0), align 8
+  %inc.6 = add i64 %7, 1
+  store i64 %inc.6, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 6, i32 0), align 8
+  %8 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 7, i32 0), align 8
+  %inc.7 = add i64 %8, 1
+  store i64 %inc.7, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 7, i32 0), align 8
+  %9 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 8, i32 0), align 8
+  %inc.8 = add i64 %9, 1
+  store i64 %inc.8, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 8, i32 0), align 8
+  %10 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 9, i32 0), align 8
+  %inc.9 = add i64 %10, 1
+  store i64 %inc.9, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 9, i32 0), align 8
+  %11 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 10, i32 0), align 8
+  %inc.10 = add i64 %11, 1
+  store i64 %inc.10, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 10, i32 0), align 8
+  %12 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 11, i32 0), align 8
+  %inc.11 = add i64 %12, 1
+  store i64 %inc.11, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 11, i32 0), align 8
+  %13 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 12, i32 0), align 8
+  %inc.12 = add i64 %13, 1
+  store i64 %inc.12, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 12, i32 0), align 8
+  ret void
+}
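A note on reading the CHECK lines in this test: the target is i386, so each i64 increment is lowered to a 32-bit addl of 1 into the low word followed by an adcl of 0 into the high word, which absorbs the carry out of the low half (the final increment routes its high half through %eax, but the arithmetic is the same). The small standalone program below is purely an illustration, unrelated to the test harness, of what such an addl/adcl pair computes.

// add_adc_pair.cpp - what an "addl $1, lo / adcl $0, hi" pair computes,
// shown with explicit 32-bit halves of a 64-bit counter.
#include <cstdint>
#include <cstdio>

// Increment a 64-bit value stored as two 32-bit halves, the way the i386
// output asserted by the CHECK lines does it.
static void inc64(uint32_t &lo, uint32_t &hi) {
  uint32_t oldLo = lo;
  lo += 1;                    // addl $1, lo
  hi += (lo < oldLo) ? 1 : 0; // adcl $0, hi  (add the carry from the low half)
}

int main() {
  uint32_t lo = 0xffffffffu, hi = 0x00000000u;
  inc64(lo, hi);
  std::printf("0x%08x_%08x\n", hi, lo); // prints 0x00000001_00000000
  return 0;
}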