Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2095,12 +2095,23 @@
   // the load output chain as an operand. Return InputChain by reference.
   SDValue Chain = StoreNode->getChain();

-  bool ChainCheck = false;
   if (Chain == Load.getValue(1)) {
-    ChainCheck = true;
     InputChain = LoadNode->getChain();
-  } else if (Chain.getOpcode() == ISD::TokenFactor) {
+    return true;
+  }
+
+  if (Chain.getOpcode() == ISD::TokenFactor) {
+    // Fusing Load-Op-Store requires predecessors of the store to also
+    // be predecessors of the load. This addition may cause a loop. We
+    // can check this by doing a search for Load in the new
+    // dependencies. As this can be expensive, heuristically prune
+    // this search by visiting the uses and making sure they all have
+    // smaller node ids than the load.
+
+    bool ChainCheck = false;
     SmallVector<SDValue, 4> ChainOps;
+    SmallVector<const SDNode *, 4> LoopWorklist;
+    SmallPtrSet<const SDNode *, 16> Visited;
     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
       SDValue Op = Chain.getOperand(i);
       if (Op == Load.getValue(1)) {
@@ -2109,33 +2120,32 @@
         ChainOps.push_back(Load.getOperand(0));
         continue;
       }
+      LoopWorklist.push_back(Op.getNode());
+      ChainOps.push_back(Op);
+    }

-      // Make sure using Op as part of the chain would not cause a cycle here.
-      // In theory, we could check whether the chain node is a predecessor of
-      // the load. But that can be very expensive. Instead visit the uses and
-      // make sure they all have smaller node id than the load.
-      int LoadId = LoadNode->getNodeId();
-      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
-             UE = UI->use_end(); UI != UE; ++UI) {
-        if (UI.getUse().getResNo() != 0)
-          continue;
-        if (UI->getNodeId() > LoadId)
+    // If the loop worklist is not empty, check whether we would create a loop.
+    if (ChainCheck) {
+      if (!LoopWorklist.empty()) {
+        unsigned int Max = 8192;
+        // Check whether Load is a predecessor of the potentially
+        // loop-inducing chain dependencies.
+        if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist,
+                                         Max))
+          return false;
+        // Fail conservatively if we ended the loop search early.
+        if (Visited.size() >= Max)
          return false;
       }
-      ChainOps.push_back(Op);
-    }
-
-    if (ChainCheck)
       // Make a new TokenFactor with all the other input chains except
       // for the load.
       InputChain =
           CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
+      return true;
+    }
   }

-  if (!ChainCheck)
     return false;
-
-  return true;
 }

 // Change a chain of {load; op; store} of the same value into a simple op
@@ -2365,6 +2375,8 @@
   MemOp[1] = LoadNode->getMemOperand();
   Result->setMemRefs(MemOp, MemOp + 2);

+  // Update Load Chain uses as well.
+  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
   ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
   ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
   CurDAG->RemoveDeadNode(Node);
Index: llvm/test/CodeGen/X86/i256-add.ll
===================================================================
--- llvm/test/CodeGen/X86/i256-add.ll
+++ llvm/test/CodeGen/X86/i256-add.ll
@@ -9,40 +9,30 @@
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %edi
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $12, %esp
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 28(%eax), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl 24(%eax), %ecx
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movl 20(%eax), %esi
+; X32-NEXT: movl 16(%eax), %edi
+; X32-NEXT: movl 12(%eax), %ebx
+; X32-NEXT: movl 8(%eax), %ebp
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %edx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 8(%ecx), %edi
-; X32-NEXT: movl (%ecx), %edx
-; X32-NEXT: movl 4(%ecx), %ebx
-; X32-NEXT: movl 28(%eax), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 24(%eax), %ebp
-; X32-NEXT: addl (%eax), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl 4(%eax), %ebx
-; X32-NEXT: adcl 8(%eax), %edi
-; X32-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X32-NEXT: movl 20(%eax), %edi
-; X32-NEXT: movl 12(%eax), %edx
-; X32-NEXT: movl 16(%eax), %esi
-; X32-NEXT: adcl 12(%ecx), %edx
-; X32-NEXT: adcl 16(%ecx), %esi
-; X32-NEXT: adcl 20(%ecx), %edi
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: adcl 24(%ecx), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
-; X32-NEXT: adcl %ebp, 28(%ecx)
-; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, 8(%ecx)
-; X32-NEXT: movl %ebx, 4(%ecx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, (%ecx)
-; X32-NEXT: movl %edx, 12(%ecx)
-; X32-NEXT: movl %esi, 16(%ecx)
-; X32-NEXT: movl %edi, 20(%ecx)
-; X32-NEXT: movl %eax, 24(%ecx)
-; X32-NEXT: addl $12, %esp
+; X32-NEXT: addl %ecx, (%eax)
+; X32-NEXT: adcl %edx, 4(%eax)
+; X32-NEXT: adcl %ebp, 8(%eax)
+; X32-NEXT: adcl %ebx, 12(%eax)
+; X32-NEXT: adcl %edi, 16(%eax)
+; X32-NEXT: adcl %esi, 20(%eax)
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, 24(%eax)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, 28(%eax)
+; X32-NEXT: addl $8, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %edi
 ; X32-NEXT: popl %ebx
@@ -51,17 +41,14 @@
 ;
 ; X64-LABEL: add:
 ; X64: # %bb.0:
-; X64-NEXT: movq 16(%rdi), %rax
-; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq 8(%rdi), %rdx
-; X64-NEXT: movq 24(%rsi), %r8
-; X64-NEXT: addq (%rsi), %rcx
-; X64-NEXT: adcq 8(%rsi), %rdx
-; X64-NEXT: adcq 16(%rsi), %rax
-; X64-NEXT: adcq %r8, 24(%rdi)
-; X64-NEXT: movq %rax, 16(%rdi)
-; X64-NEXT: movq %rdx, 8(%rdi)
-; X64-NEXT: movq %rcx, (%rdi)
+; X64-NEXT: movq 24(%rsi), %rax
+; X64-NEXT: movq 16(%rsi), %rcx
+; X64-NEXT: movq (%rsi), %rdx
+; X64-NEXT: movq 8(%rsi), %rsi
+; X64-NEXT: addq %rdx, (%rdi)
+; X64-NEXT: adcq %rsi, 8(%rdi)
+; X64-NEXT: adcq %rcx, 16(%rdi)
+; X64-NEXT: adcq %rax, 24(%rdi)
 ; X64-NEXT: retq
   %a = load i256, i256* %p
   %b = load i256, i256* %q
@@ -77,35 +64,28 @@
 ; X32-NEXT: pushl %edi
 ; X32-NEXT: pushl %esi
 ; X32-NEXT: subl $8, %esp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 16(%ecx), %eax
-; X32-NEXT: movl 12(%ecx), %edx
-; X32-NEXT: movl 8(%ecx), %edi
-; X32-NEXT: movl (%ecx), %ebx
-; X32-NEXT: movl 4(%ecx), %ebp
-; X32-NEXT: subl (%esi), %ebx
-; X32-NEXT: sbbl 4(%esi), %ebp
-; X32-NEXT: sbbl 8(%esi), %edi
-; X32-NEXT: sbbl 12(%esi), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: sbbl 16(%esi), %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl 20(%ecx), %edx
-; X32-NEXT: sbbl 20(%esi), %edx
-; X32-NEXT: movl 24(%ecx), %eax
-; X32-NEXT: sbbl 24(%esi), %eax
-; X32-NEXT: movl 28(%esi), %esi
-; X32-NEXT: sbbl %esi, 28(%ecx)
-; X32-NEXT: movl %edi, 8(%ecx)
-; X32-NEXT: movl %ebp, 4(%ecx)
-; X32-NEXT: movl %ebx, (%ecx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 12(%ecx)
-; X32-NEXT: movl (%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 16(%ecx)
-; X32-NEXT: movl %edx, 20(%ecx)
-; X32-NEXT: movl %eax, 24(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 28(%eax), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl 24(%eax), %ecx
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movl 20(%eax), %esi
+; X32-NEXT: movl 16(%eax), %edi
+; X32-NEXT: movl 12(%eax), %ebx
+; X32-NEXT: movl 8(%eax), %ebp
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: subl %ecx, (%eax)
+; X32-NEXT: sbbl %edx, 4(%eax)
+; X32-NEXT: sbbl %ebp, 8(%eax)
+; X32-NEXT: sbbl %ebx, 12(%eax)
+; X32-NEXT: sbbl %edi, 16(%eax)
+; X32-NEXT: sbbl %esi, 20(%eax)
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: sbbl %ecx, 24(%eax)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: sbbl %ecx, 28(%eax)
 ; X32-NEXT: addl $8, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %edi
@@ -115,17 +95,14 @@
 ;
 ; X64-LABEL: sub:
 ; X64: # %bb.0:
-; X64-NEXT: movq 16(%rdi), %rax
-; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq 8(%rdi), %rdx
-; X64-NEXT: movq 24(%rsi), %r8
-; X64-NEXT: subq (%rsi), %rcx
-; X64-NEXT: sbbq 8(%rsi), %rdx
-; X64-NEXT: sbbq 16(%rsi), %rax
-; X64-NEXT: sbbq %r8, 24(%rdi)
-; X64-NEXT: movq %rax, 16(%rdi)
-; X64-NEXT: movq %rdx, 8(%rdi)
-; X64-NEXT: movq %rcx, (%rdi)
+; X64-NEXT: movq 24(%rsi), %rax
+; X64-NEXT: movq 16(%rsi), %rcx
+; X64-NEXT: movq (%rsi), %rdx
+; X64-NEXT: movq 8(%rsi), %rsi
+; X64-NEXT: subq %rdx, (%rdi)
+; X64-NEXT: sbbq %rsi, 8(%rdi)
+; X64-NEXT: sbbq %rcx, 16(%rdi)
+; X64-NEXT: sbbq %rax, 24(%rdi)
 ; X64-NEXT: retq
   %a = load i256, i256* %p
   %b = load i256, i256* %q
Index: llvm/test/CodeGen/X86/load-op-store-fusion.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/load-op-store-fusion.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+
+; This test makes sure we do not merge both load-op-store pairs here as it causes a cycle.
+
+define i8* @fn(i32 %i.015.i, [64 x i64]* %data.i) {
+; X32-LABEL: fn:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx,%eax,8), %edx
+; X32-NEXT: addl $1, %edx
+; X32-NEXT: adcl $0, 4(%ecx,%eax,8)
+; X32-NEXT: movl %edx, (%ecx,%eax,8)
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: fn:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: incq (%rsi,%rax,8)
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+entry:
+  %arrayidx.i6 = getelementptr inbounds [64 x i64], [64 x i64]* %data.i, i32 0, i32 %i.015.i
+  %x8 = load volatile i64, i64* %arrayidx.i6, align 8
+  %inc.i7 = add i64 %x8, 1
+  store volatile i64 %inc.i7, i64* %arrayidx.i6, align 8
+  ret i8* null
+}