Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -25767,6 +25767,57 @@
   return SDValue();
 }
 
+/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
+/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
+/// to combine math ops, use an LEA, or use a complex addressing mode. This can
+/// eliminate extend, add, and shift instructions.
+static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  // TODO: This should be valid for other integer types.
+  EVT VT = Sext->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+
+  // We need an 'add nsw' feeding into the 'sext'.
+  SDValue Add = Sext->getOperand(0);
+  if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
+    return SDValue();
+
+  // Having a constant operand to the 'add' ensures that we are not increasing
+  // the instruction count because the constant is extended for free below.
+  // A constant operand can also become the displacement field of an LEA.
+  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+  if (!AddOp1)
+    return SDValue();
+
+  // Don't make the 'add' bigger if there's no hope of combining it with some
+  // other 'add' or 'shl' instruction.
+  // TODO: It may be profitable to generate simpler LEA instructions in place
+  // of single 'add' instructions, but the cost model for selecting an LEA
+  // currently has a high threshold.
+  bool HasLEAPotential = false;
+  for (auto *User : Sext->uses()) {
+    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
+      HasLEAPotential = true;
+      break;
+    }
+  }
+  if (!HasLEAPotential)
+    return SDValue();
+
+  // Everything looks good, so pull the 'sext' ahead of the 'add'.
+  int64_t AddConstant = AddOp1->getSExtValue();
+  SDValue AddOp0 = Add.getOperand(0);
+  SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
+  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
+
+  // The wider add is guaranteed to not wrap because both operands are
+  // sign-extended.
+  SDNodeFlags Flags;
+  Flags.setNoSignedWrap(true);
+  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
+}
+
 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
@@ -25861,6 +25912,9 @@
   if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
     return R;
 
+  if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
+    return NewAdd;
+
   return SDValue();
 }
 
Index: llvm/trunk/test/CodeGen/X86/add-nsw-sext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/add-nsw-sext.ll
+++ llvm/trunk/test/CodeGen/X86/add-nsw-sext.ll
@@ -1,15 +1,14 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 
 ; The fundamental problem: an add separated from other arithmetic by a sext can't
-; be combined with the later instructions. However, if the first add is 'nsw', 
+; be combined with the later instructions. However, if the first add is 'nsw',
 ; then we can promote the sext ahead of that add to allow optimizations.
 
 define i64 @add_nsw_consts(i32 %i) {
 ; CHECK-LABEL: add_nsw_consts:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addl $5, %edi
 ; CHECK-NEXT:    movslq %edi, %rax
-; CHECK-NEXT:    addq $7, %rax
+; CHECK-NEXT:    addq $12, %rax
 ; CHECK-NEXT:    retq
 
   %add = add nsw i32 %i, 5
@@ -24,9 +23,8 @@
 define i64 @add_nsw_sext_add(i32 %i, i64 %x) {
 ; CHECK-LABEL: add_nsw_sext_add:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addl $5, %edi
 ; CHECK-NEXT:    movslq %edi, %rax
-; CHECK-NEXT:    addq %rsi, %rax
+; CHECK-NEXT:    leaq 5(%rax,%rsi), %rax
 ; CHECK-NEXT:    retq
 
   %add = add nsw i32 %i, 5
@@ -41,9 +39,8 @@
 define i64 @add_nsw_sext_lsh_add(i32 %i, i64 %x) {
 ; CHECK-LABEL: add_nsw_sext_lsh_add:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addl $-5, %edi
 ; CHECK-NEXT:    movslq %edi, %rax
-; CHECK-NEXT:    leaq (%rsi,%rax,8), %rax
+; CHECK-NEXT:    leaq -40(%rsi,%rax,8), %rax
 ; CHECK-NEXT:    retq
 
   %add = add nsw i32 %i, -5
@@ -73,9 +70,8 @@
 define i8* @gep8(i32 %i, i8* %x) {
 ; CHECK-LABEL: gep8:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addl $5, %edi
 ; CHECK-NEXT:    movslq %edi, %rax
-; CHECK-NEXT:    addq %rsi, %rax
+; CHECK-NEXT:    leaq 5(%rax,%rsi), %rax
 ; CHECK-NEXT:    retq
 
   %add = add nsw i32 %i, 5
@@ -87,9 +83,8 @@
 define i16* @gep16(i32 %i, i16* %x) {
 ; CHECK-LABEL: gep16:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addl $-5, %edi
 ; CHECK-NEXT:    movslq %edi, %rax
-; CHECK-NEXT:    leaq (%rsi,%rax,2), %rax
+; CHECK-NEXT:    leaq -10(%rsi,%rax,2), %rax
 ; CHECK-NEXT:    retq
 
   %add = add nsw i32 %i, -5
@@ -101,9 +96,8 @@
 define i32* @gep32(i32 %i, i32* %x) {
 ; CHECK-LABEL: gep32:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addl $5, %edi
 ; CHECK-NEXT:    movslq %edi, %rax
-; CHECK-NEXT:    leaq (%rsi,%rax,4), %rax
+; CHECK-NEXT:    leaq 20(%rsi,%rax,4), %rax
 ; CHECK-NEXT:    retq
 
   %add = add nsw i32 %i, 5
@@ -115,9 +109,8 @@
 define i64* @gep64(i32 %i, i64* %x) {
 ; CHECK-LABEL: gep64:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addl $-5, %edi
 ; CHECK-NEXT:    movslq %edi, %rax
-; CHECK-NEXT:    leaq (%rsi,%rax,8), %rax
+; CHECK-NEXT:    leaq -40(%rsi,%rax,8), %rax
 ; CHECK-NEXT:    retq
 
   %add = add nsw i32 %i, -5
@@ -131,10 +124,9 @@
 define i128* @gep128(i32 %i, i128* %x) {
 ; CHECK-LABEL: gep128:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addl $5, %edi
 ; CHECK-NEXT:    movslq %edi, %rax
 ; CHECK-NEXT:    shlq $4, %rax
-; CHECK-NEXT:    addq %rsi, %rax
+; CHECK-NEXT:    leaq 80(%rax,%rsi), %rax
 ; CHECK-NEXT:    retq
 
   %add = add nsw i32 %i, 5
@@ -150,14 +142,10 @@
 define void @PR20134(i32* %a, i32 %i) {
 ; CHECK-LABEL: PR20134:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    leal 1(%rsi), %eax
-; CHECK-NEXT:    cltq
-; CHECK-NEXT:    movl (%rdi,%rax,4), %eax
-; CHECK-NEXT:    leal 2(%rsi), %ecx
-; CHECK-NEXT:    movslq %ecx, %rcx
-; CHECK-NEXT:    addl (%rdi,%rcx,4), %eax
-; CHECK-NEXT:    movslq %esi, %rcx
-; CHECK-NEXT:    movl %eax, (%rdi,%rcx,4)
+; CHECK-NEXT:    movslq %esi, %rax
+; CHECK-NEXT:    movl 4(%rdi,%rax,4), %ecx
+; CHECK-NEXT:    addl 8(%rdi,%rax,4), %ecx
+; CHECK-NEXT:    movl %ecx, (%rdi,%rax,4)
 ; CHECK-NEXT:    retq
 
   %add1 = add nsw i32 %i, 1
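
Note on the transform (an illustration, not part of the patch): the pattern this
combine targets typically comes from 32-bit signed index arithmetic on a 64-bit
target. Because signed overflow is undefined behavior in C and C++, front ends
emit 'add nsw' for the index math, followed by a sign extension for the address
computation. Hoisting the sext past the 'add nsw' is sound because both operands
of the widened add are themselves sign-extended 32-bit values, so the 64-bit add
cannot wrap either; without 'nsw', the narrow add could wrap and the hoist would
change the result. A minimal C++ sketch of the source pattern (hypothetical
function name; roughly mirrors the gep32 test above):

  #include <cstdint>

  // With the combine, x86-64 codegen can fold the +5 displacement, the sign
  // extension, and the 4-byte element scaling into one addressing mode,
  // e.g. roughly:
  //   movslq %esi, %rax
  //   movl   20(%rdi,%rax,4), %eax
  int32_t load_at_offset(const int32_t *p, int32_t i) {
    return p[i + 5]; // IR: add nsw i32 %i, 5; sext i32 to i64; gep i32, i32* %p
  }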