Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2100,27 +2100,46 @@ LoadNode->getOffset() != StoreNode->getOffset()) return false; - // Check if the chain is produced by the load or is a TokenFactor with - // the load output chain as an operand. Return InputChain by reference. - SDValue Chain = StoreNode->getChain(); - - if (Chain == Load.getValue(1)) { - InputChain = LoadNode->getChain(); - return true; - } - - if (Chain.getOpcode() == ISD::TokenFactor) { - // Fusing Load-Op-Store requires predecessors of store must also - // be predecessors of the load. This addition may cause a loop. We - // can check this by doing a search for Load in the new - // dependencies. As this can be expensive, heuristically prune - // this search by visiting the uses and make sure they all have - // smaller node id than the load. - bool FoundLoad = false; SmallVector<SDValue, 4> ChainOps; SmallVector<const SDNode *, 4> LoopWorklist; SmallPtrSet<const SDNode *, 16> Visited; + const unsigned int Max = 1024; + + // Visualization of Merge: + // ------------------------- + // Legend: + // *-lines = chain dependencies. + // |-lines = Normal dependencies. + // Dependencies flow down and right. + // + // C X C + // * * * + // * * * + // X A-LD Y1,Y2,... TF Y1,Y2,... + // * * \ | * | + // * * \ | * | + // * * \ | => A--LD_OP_ST + // * * \| \ + // TF OP \ + // * | \ Z1,Z2,... + // * | \ + // A-ST Z1,Z2,... + // + + // For a cycle to happen one of the inputs of the merged nodes must + // be a successor to another merged node. As the merged nodes are a + // chain this reduces to showing that X, A, C, Y1,... are not + // successors of LD. As A and C are structurally impossible we can just + // check X and Y. + + SDValue Chain = StoreNode->getChain(); + + // Gather X elements in ChainOps. 
+ if (Chain == Load.getValue(1)) { + FoundLoad = true; + ChainOps.push_back(Load.getOperand(0)); + } else if (Chain.getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { @@ -2132,28 +2151,23 @@ LoopWorklist.push_back(Op.getNode()); ChainOps.push_back(Op); } + } - if (!FoundLoad) - return false; + if (!FoundLoad) + return false; + // Add Y1, ... + for (SDValue Op : StoredVal->ops()) + if (Op.getNode() != LoadNode) + LoopWorklist.push_back(Op.getNode()); - // If Loop Worklist is not empty. Check if we would make a loop. - if (!LoopWorklist.empty()) { - const unsigned int Max = 8192; - // if Load is predecessor to potentially loop inducing chain - // dependencies. - if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, - Max, true)) - return false; - } + if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, + true)) + return false; - // Make a new TokenFactor with all the other input chains except - // for the load. 
- InputChain = - CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); - return true; + InputChain = + CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); + return true; } - return false; -} // Change a chain of {load; op; store} of the same value into a simple op // through memory of that value, if the uses of the modified value and its Index: llvm/test/CodeGen/X86/pr36274.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/pr36274.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s + +@g = external local_unnamed_addr global i32, align 4 +@a = external local_unnamed_addr global [13 x [2 x i64]], align 8 + +define void @main() local_unnamed_addr { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addl $1, a +; CHECK-NEXT: adcl $0, a+4 +; CHECK-NEXT: addl $1, a+16 +; CHECK-NEXT: adcl $0, a+20 +; CHECK-NEXT: addl $1, a+32 +; CHECK-NEXT: adcl $0, a+36 +; CHECK-NEXT: addl $1, a+48 +; CHECK-NEXT: adcl $0, a+52 +; CHECK-NEXT: addl $1, a+64 +; CHECK-NEXT: adcl $0, a+68 +; CHECK-NEXT: addl $1, a+80 +; CHECK-NEXT: adcl $0, a+84 +; CHECK-NEXT: addl $1, a+96 +; CHECK-NEXT: adcl $0, a+100 +; CHECK-NEXT: addl $1, a+112 +; CHECK-NEXT: adcl $0, a+116 +; CHECK-NEXT: addl $1, a+128 +; CHECK-NEXT: adcl $0, a+132 +; CHECK-NEXT: addl $1, a+144 +; CHECK-NEXT: adcl $0, a+148 +; CHECK-NEXT: addl $1, a+160 +; CHECK-NEXT: adcl $0, a+164 +; CHECK-NEXT: addl $1, a+176 +; CHECK-NEXT: adcl $0, a+180 +; CHECK-NEXT: movl a+196, %eax +; CHECK-NEXT: addl $1, a+192 +; CHECK-NEXT: adcl $0, %eax +; CHECK-NEXT: movl %eax, a+196 +; CHECK-NEXT: retl +entry: + %0 = load i32, i32* @g, align 4 + %1 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 0, i32 0), align 8 + %inc = add i64 %1, 1 + store i64 %inc, i64* getelementptr inbounds ([13 x [2 x 
i64]], [13 x [2 x i64]]* @a, i32 0, i32 0, i32 0), align 8 + %2 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 1, i32 0), align 8 + %inc.1 = add i64 %2, 1 + store i64 %inc.1, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 1, i32 0), align 8 + %3 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 2, i32 0), align 8 + %inc.2 = add i64 %3, 1 + store i64 %inc.2, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 2, i32 0), align 8 + %4 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 3, i32 0), align 8 + %inc.3 = add i64 %4, 1 + store i64 %inc.3, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 3, i32 0), align 8 + %5 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 4, i32 0), align 8 + %inc.4 = add i64 %5, 1 + store i64 %inc.4, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 4, i32 0), align 8 + %6 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 5, i32 0), align 8 + %inc.5 = add i64 %6, 1 + store i64 %inc.5, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 5, i32 0), align 8 + %7 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 6, i32 0), align 8 + %inc.6 = add i64 %7, 1 + store i64 %inc.6, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 6, i32 0), align 8 + %8 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 7, i32 0), align 8 + %inc.7 = add i64 %8, 1 + store i64 %inc.7, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 7, i32 0), align 8 + %9 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 8, i32 0), align 8 + 
%inc.8 = add i64 %9, 1 + store i64 %inc.8, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 8, i32 0), align 8 + %10 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 9, i32 0), align 8 + %inc.9 = add i64 %10, 1 + store i64 %inc.9, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 9, i32 0), align 8 + %11 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 10, i32 0), align 8 + %inc.10 = add i64 %11, 1 + store i64 %inc.10, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 10, i32 0), align 8 + %12 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 11, i32 0), align 8 + %inc.11 = add i64 %12, 1 + store i64 %inc.11, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 11, i32 0), align 8 + %13 = load i64, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 12, i32 0), align 8 + %inc.12 = add i64 %13, 1 + store i64 %inc.12, i64* getelementptr inbounds ([13 x [2 x i64]], [13 x [2 x i64]]* @a, i32 0, i32 12, i32 0), align 8 + ret void + }