Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23971,11 +23971,29 @@
   }
 }
 
+static unsigned getSmallDivisor(uint64_t Val) {
+  // Note that the order is important - 9 must be checked before 3,
+  // since every multiple of 9 is also a multiple of 3.
+  if (Val % 9 == 0)
+    return 9;
+  if (Val % 5 == 0)
+    return 5;
+  if (Val % 3 == 0)
+    return 3;
+  return 0;
+}
+
 /// PerformMulCombine - Optimize a single multiply with constant into two
 /// in order to implement it with two cheaper instructions, e.g.
 /// LEA + SHL, LEA + LEA.
 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI) {
+  // The sequences we want to produce are larger than a single imul, so
+  // disable this combine for -Oz (minsize).
+  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::MinSize))
+    return SDValue();
+
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
@@ -23986,51 +24004,62 @@
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!C)
     return SDValue();
-  uint64_t MulAmt = C->getZExtValue();
-  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+
+  bool IsNeg = C->getSExtValue() < 0;
+  uint64_t MulAmt = IsNeg ? -C->getSExtValue() : C->getSExtValue();
+
+  // These cases are handled by the target-independent DAGCombiner.
+  if (!MulAmt || isPowerOf2_64(MulAmt))
     return SDValue();
 
-  uint64_t MulAmt1 = 0;
-  uint64_t MulAmt2 = 0;
-  if ((MulAmt % 9) == 0) {
-    MulAmt1 = 9;
-    MulAmt2 = MulAmt / 9;
-  } else if ((MulAmt % 5) == 0) {
-    MulAmt1 = 5;
-    MulAmt2 = MulAmt / 5;
-  } else if ((MulAmt % 3) == 0) {
-    MulAmt1 = 3;
-    MulAmt2 = MulAmt / 3;
-  }
-  if (MulAmt2 &&
-      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
-    SDLoc DL(N);
+  unsigned ShiftAmt = countTrailingZeros(MulAmt);
+  if (ShiftAmt)
+    MulAmt >>= ShiftAmt;
+
+  // Count how many steps we will have to perform to replace the MUL. We
+  // limit this to 3 steps, based on an imul latency of 4 (if the latency
+  // is equal, this is probably not a win).
+  unsigned Steps = 0;
+  const unsigned MAX_STEPS = 3;
+  // Having to negate or shift is a step.
+  Steps += (IsNeg ? 1 : 0);
+  Steps += (ShiftAmt ? 1 : 0);
+
+  // Expand multiplications by integers whose only factors are in
+  // {3, 5, 9, 2^C}.
+  SDValue NewMul = N->getOperand(0);
+  SDLoc DL(N);
 
-    if (isPowerOf2_64(MulAmt2) &&
-        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
-      // If second multiplifer is pow2, issue it first. We want the multiply by
-      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
-      // is an add.
-      std::swap(MulAmt1, MulAmt2);
-
-    SDValue NewMul;
-    if (isPowerOf2_64(MulAmt1))
-      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
-                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
-    else
-      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
-                           DAG.getConstant(MulAmt1, VT));
+  // First we shift, then we multiply by the small constant, with the
+  // expectation that the small-constant multiply will become a LEA. But
+  // if the original mul has a single use, and that use is an ADD, we may
+  // get better code from the opposite order, since that ADD may be part
+  // of an addressing-mode calculation.
+  bool MulFirst = N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD;
+
+  if (ShiftAmt && !MulFirst)
+    NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+                         DAG.getConstant(ShiftAmt, MVT::i8));
+
+  unsigned Val = getSmallDivisor(MulAmt);
+  for (; Val && (Steps < MAX_STEPS) && (MulAmt != 1); ++Steps) {
+    NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+                         DAG.getConstant(Val, VT));
+    MulAmt /= Val;
+    Val = getSmallDivisor(MulAmt);
+  }
+
+  if (ShiftAmt && MulFirst)
+    NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+                         DAG.getConstant(ShiftAmt, MVT::i8));
 
-    if (isPowerOf2_64(MulAmt2))
-      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
-                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
-    else
-      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
-                           DAG.getConstant(MulAmt2, VT));
+  if (IsNeg)
+    NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), NewMul);
 
-    // Do not add new nodes to DAG combiner worklist.
+  // Do not add new nodes to the DAG combiner worklist.
+  if (MulAmt == 1)
     DCI.CombineTo(N, NewMul, false);
-  }
 
   return SDValue();
 }
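A standalone sketch of the step accounting implemented above may help when reviewing. It mirrors getSmallDivisor and the budget loop; the names smallDivisor and decompose, and the printing, are illustrative only and not part of the patch. It prints the shift-first order, and does not model the MulFirst reordering or the actual DAG node creation.

// Standalone model of the combine above. Assumptions: smallDivisor and
// decompose are illustrative names; Budget = 3 matches MAX_STEPS.
#include <cstdint>
#include <cstdio>

static unsigned smallDivisor(uint64_t V) {
  // 9 is checked before 3: every multiple of 9 is also a multiple of 3.
  if (V % 9 == 0) return 9;
  if (V % 5 == 0) return 5;
  if (V % 3 == 0) return 3;
  return 0;
}

// Prints the operation sequence for mul-by-Imm and returns true if it
// fits the step budget (i.e. the combine would fire).
static bool decompose(int64_t Imm, unsigned Budget = 3) {
  bool Neg = Imm < 0;
  uint64_t V = Neg ? 0 - (uint64_t)Imm : (uint64_t)Imm;
  unsigned Shift = 0;
  while (V != 0 && (V & 1) == 0) { // countTrailingZeros
    V >>= 1;
    ++Shift;
  }
  // Having to negate or shift is a step, as in the patch.
  unsigned Steps = (Neg ? 1 : 0) + (Shift ? 1 : 0);
  printf("%7lld: ", (long long)Imm);
  if (Shift)
    printf("shl %u; ", Shift);
  unsigned D = smallDivisor(V);
  for (; D && Steps < Budget && V != 1; ++Steps) {
    printf("lea x%u; ", D);
    V /= D;
    D = smallDivisor(V);
  }
  if (Neg)
    printf("neg; ");
  printf("%s\n", V == 1 ? "=> combined" : "=> kept as imul");
  return V == 1;
}

For example, 1920 = 2^7 * 3 * 5 comes out as "shl 7; lea x5; lea x3": one shift plus two LEA-able multiplies, exactly at the three-step budget.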
Index: test/CodeGen/X86/imul-lea-2.ll
===================================================================
--- test/CodeGen/X86/imul-lea-2.ll
+++ test/CodeGen/X86/imul-lea-2.ll
@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
-
-; CHECK-NOT: imul
-
-define i64 @t1(i64 %a) nounwind readnone {
-entry:
-  %0 = mul i64 %a, 81
-; CHECK: lea
-; CHECK: lea
-  ret i64 %0
-}
-
-define i64 @t2(i64 %a) nounwind readnone {
-entry:
-  %0 = mul i64 %a, 40
-; CHECK: shl
-; CHECK: lea
-  ret i64 %0
-}
Index: test/CodeGen/X86/imul.ll
===================================================================
--- test/CodeGen/X86/imul.ll
+++ test/CodeGen/X86/imul.ll
@@ -69,8 +69,7 @@
 ; X64-LABEL: mul3_32:
 ; X64: leal
 ; X86-LABEL: mul3_32:
-; But why?!
-; X86: imull
+; X86: leal
   %mul = mul i32 %A, 3
   ret i32 %mul
 }
@@ -79,8 +78,10 @@
 ; X64-LABEL: mul3_64:
 ; X64: leaq
 ; X86-LABEL: mul3_64:
-; X86: mull
-; X86-NEXT: imull
+; X86: leal
+; X86-NEXT: movl
+; X86-NEXT: mull
+; X86-NEXT: addl
   %mul = mul i64 %A, 3
   ret i64 %mul
 }
@@ -108,3 +109,54 @@
   %mul = mul i64 %A, 40
   ret i64 %mul
 }
+
+define i32 @mul81_32(i32 %A) {
+; X64-LABEL: mul81_32:
+; X64: leal
+; X64-NEXT: leal
+; X86-LABEL: mul81_32:
+; X86: leal
+; X86-NEXT: leal
+  %mul = mul i32 %A, 81
+  ret i32 %mul
+}
+
+define i32 @mulmin81_32(i32 %A) {
+; X64-LABEL: mulmin81_32:
+; X64: leal
+; X64-NEXT: leal
+; X64-NEXT: negl
+; X86-LABEL: mulmin81_32:
+; X86: leal
+; X86-NEXT: leal
+; X86-NEXT: negl
+  %mul = mul i32 %A, -81
+  ret i32 %mul
+}
+
+define i32 @mul1920_32(i32 %A) {
+; X64-LABEL: mul1920_32:
+; X64: shll
+; X64-NEXT: leal
+; X64-NEXT: leal
+; X86-LABEL: mul1920_32:
+; X86: shll
+; X86-NEXT: leal
+; X86-NEXT: leal
+  %mul = mul i32 %A, 1920
+  ret i32 %mul
+}
+
+; These muls require too many instructions (4 steps each), so they stay imul.
+define i32 @negative(i32 %A) {
+; X64-LABEL: negative:
+; X64: imull $-1920
+; X64: imull $625
+; X86-LABEL: negative:
+; X86: imull $-1920
+; X86: imull $625
+  %mul = mul i32 %A, -1920
+  %mul2 = mul i32 %A, 625
+  %f = add i32 %mul, %mul2
+  ret i32 %f
+}
\ No newline at end of file
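As a sanity check against the tests above, a small driver over the same constants (hypothetical, building on the decompose sketch earlier) reproduces the expected split between combined and imul cases:

int main() {
  decompose(81);    // lea x9; lea x9;        => combined (2 steps)
  decompose(-81);   // lea x9; lea x9; neg;   => combined (3 steps)
  decompose(1920);  // shl 7; lea x5; lea x3; => combined (3 steps)
  decompose(-1920); // neg + shl + two leas = 4 steps => kept as imul
  decompose(625);   // 5^4 needs four leas            => kept as imul
  return 0;
}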