diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49961,11 +49961,35 @@
   if (!isSuitableCmov(Cmov))
     return SDValue();
 
-  // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   SDValue FalseOp = Cmov.getOperand(0);
   SDValue TrueOp = Cmov.getOperand(1);
+
+  // We will push the add through the select, but we can potentially do better
+  // if we know there is another add in the sequence and this is pointer math.
+  // In that case, we can absorb an add into the trailing memory op and avoid
+  // a 3-operand LEA which is likely slower than a 2-operand LEA.
+  // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
+  if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
+      !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
+      // Every user of this add must be a memory op that uses it as base ptr.
+      all_of(N->uses(), [&](SDNode *Use) {
+        auto *MemNode = dyn_cast<MemSDNode>(Use);
+        return MemNode && MemNode->getBasePtr().getNode() == N;
+      })) {
+    // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
+    // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
+    //       it is possible that choosing op1 might be better.
+    SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
+    FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
+    TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
+    Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
+                       Cmov.getOperand(2), Cmov.getOperand(3));
+    return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
+  }
+
+  // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
   FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
   TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
   return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
diff --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll
--- a/llvm/test/CodeGen/X86/add-cmov.ll
+++ b/llvm/test/CodeGen/X86/add-cmov.ll
@@ -279,11 +279,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    leaq (%rsi,%rsi,4), %rax
 ; CHECK-NEXT:    shlq $4, %rax
+; CHECK-NEXT:    leaq 66(%rdx), %rcx
+; CHECK-NEXT:    addq $60, %rdx
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    leaq 60(%rdx,%rax), %rcx
-; CHECK-NEXT:    leaq 66(%rdx,%rax), %rax
-; CHECK-NEXT:    cmoveq %rcx, %rax
-; CHECK-NEXT:    decw (%rax)
+; CHECK-NEXT:    cmovneq %rcx, %rdx
+; CHECK-NEXT:    decw (%rdx,%rax)
 ; CHECK-NEXT:    retq
   %and = and i32 %x, 1
   %b = icmp eq i32 %and, 0
@@ -299,11 +299,11 @@
 define void @complex_lea_alt1(i1 %b, i16* readnone %ptr, i64 %idx) {
 ; CHECK-LABEL: complex_lea_alt1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 60(%rdx,%rsi), %rax
-; CHECK-NEXT:    leaq 66(%rdx,%rsi), %rcx
+; CHECK-NEXT:    leaq 60(%rdx), %rax
+; CHECK-NEXT:    addq $66, %rdx
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmovneq %rax, %rcx
-; CHECK-NEXT:    decw (%rcx)
+; CHECK-NEXT:    cmovneq %rax, %rdx
+; CHECK-NEXT:    decw (%rdx,%rsi)
 ; CHECK-NEXT:    retq
   %i = ptrtoint i16* %ptr to i64
   %sum = add i64 %idx, %i
@@ -320,11 +320,11 @@
 define void @complex_lea_alt2(i1 %b, i16* readnone %ptr, i64 %idx) {
 ; CHECK-LABEL: complex_lea_alt2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 60(%rsi,%rdx), %rax
-; CHECK-NEXT:    leaq 66(%rsi,%rdx), %rcx
+; CHECK-NEXT:    leaq 60(%rsi), %rax
+; CHECK-NEXT:    addq $66, %rsi
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmovneq %rax, %rcx
-; CHECK-NEXT:    decw (%rcx)
+; CHECK-NEXT:    cmovneq %rax, %rsi
+; CHECK-NEXT:    decw (%rsi,%rdx)
 ; CHECK-NEXT:    retq
   %i = ptrtoint i16* %ptr to i64
   %sum = add i64 %i, %idx
@@ -433,11 +433,11 @@
 define void @complex_lea_alt7(i1 %b, i16* readnone %ptr, i64 %idx) {
 ; CHECK-LABEL: complex_lea_alt7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 60(%rdx,%rsi), %rax
-; CHECK-NEXT:    leaq 66(%rdx,%rsi), %rcx
+; CHECK-NEXT:    leaq 60(%rdx), %rax
+; CHECK-NEXT:    addq $66, %rdx
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmovneq %rax, %rcx
-; CHECK-NEXT:    decw (%rcx)
+; CHECK-NEXT:    cmovneq %rax, %rdx
+; CHECK-NEXT:    decw (%rdx,%rsi)
 ; CHECK-NEXT:    retq
   %i = ptrtoint i16* %ptr to i64
   %o = add i64 %idx, %i
@@ -455,11 +455,11 @@
 define void @complex_lea_alt8(i1 %b, i16* readnone %ptr, i64 %idx) {
 ; CHECK-LABEL: complex_lea_alt8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 60(%rsi,%rdx), %rax
-; CHECK-NEXT:    leaq 66(%rsi,%rdx), %rcx
+; CHECK-NEXT:    leaq 60(%rsi), %rax
+; CHECK-NEXT:    addq $66, %rsi
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmovneq %rax, %rcx
-; CHECK-NEXT:    decw (%rcx)
+; CHECK-NEXT:    cmovneq %rax, %rsi
+; CHECK-NEXT:    decw (%rsi,%rdx)
 ; CHECK-NEXT:    retq
   %i = ptrtoint i16* %ptr to i64
   %o = add i64 %i, %idx