Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7534,19 +7534,45 @@
   // future CPUs have a cheaper MADD instruction, this may need to be
   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
   // 64-bit is 5 cycles, so this is always a win.
+  // More aggressively, some multiplications can be lowered to shift+add+shift
+  // if the constant is (2^N + 1) * 2^M.
+  // TODO: consider constants in the form of (2^N - 1) * 2^M,
+  // (2^N + 1) * 2^M + 1, or (2^N + 1) * (2^M + 1).
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
     const APInt &Value = C->getAPIntValue();
     EVT VT = N->getValueType(0);
     SDLoc DL(N);
     if (Value.isNonNegative()) {
+      // Lg2 is used to test if the mul can be lowered to shift+add+shift.
+      unsigned Lg2 = Value.countTrailingZeros();
+      // Conservatively do not lower to shift+add+shift if the mul might be
+      // folded into smull or umull.
+      if (Lg2 && (isSignExtended(N->getOperand(0).getNode(), DAG) ||
+                  isZeroExtended(N->getOperand(0).getNode(), DAG)))
+        Lg2 = 0;
+      // Conservatively do not lower to shift+add+shift if the mul might be
+      // folded into madd or msub.
+      if (Lg2)
+        for (SDNode *Use : N->uses())
+          if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
+            Lg2 = 0;
+            break;
+          }
+      APInt Shift = Value.ashr(Lg2);
       // (mul x, 2^N + 1) => (add (shl x, N), x)
-      APInt VM1 = Value - 1;
+      // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
+      APInt VM1 = Shift - 1;
       if (VM1.isPowerOf2()) {
-        SDValue ShiftedVal =
+        SDValue ShiftedValue =
             DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                         DAG.getConstant(VM1.logBase2(), DL, MVT::i64));
-        return DAG.getNode(ISD::ADD, DL, VT, ShiftedVal,
-                           N->getOperand(0));
+        SDValue Add =
+            DAG.getNode(ISD::ADD, DL, VT, ShiftedValue, N->getOperand(0));
+        if (Lg2)
+          return DAG.getNode(ISD::SHL, DL, VT, Add,
+                             DAG.getConstant(Lg2, DL, MVT::i64));
+        else
+          return Add;
       }
       // (mul x, 2^N - 1) => (sub (shl x, N), x)
       APInt VP1 = Value + 1;
Index: test/CodeGen/AArch64/arm64-mul.ll
===================================================================
--- test/CodeGen/AArch64/arm64-mul.ll
+++ test/CodeGen/AArch64/arm64-mul.ll
@@ -104,7 +104,7 @@
 ; CHECK-LABEL: t10:
 ; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
   %tmp1 = sext i32 %a to i64
-  %tmp2 = mul i64 %tmp1, 2147483650 ; = 2^31 + 2
+  %tmp2 = mul i64 %tmp1, 2147483650 ; = 2^31 + 2
   ret i64 %tmp2
 }
Index: test/CodeGen/AArch64/mul_pow2.ll
===================================================================
--- test/CodeGen/AArch64/mul_pow2.ll
+++ test/CodeGen/AArch64/mul_pow2.ll
@@ -2,6 +2,7 @@
 
 ; Convert mul x, pow2 to shift.
 ; Convert mul x, pow2 +/- 1 to shift + add/sub.
+; Convert mul x, (pow2 + 1) * pow2 to shift + add + shift.
 
 define i32 @test2(i32 %x) {
 ; CHECK-LABEL: test2
 ; CHECK: lsl w0, w0, #1
 
   %mul = mul nsw i32 %x, 2
   ret i32 %mul
 }
@@ -36,6 +37,15 @@
   ret i32 %mul
 }
 
+define i32 @test6(i32 %x) {
+; CHECK-LABEL: test6
+; CHECK: add w8, w0, w0, lsl #1
+; CHECK: lsl w0, w8, #1
+
+  %mul = mul nsw i32 %x, 6
+  ret i32 %mul
+}
+
 define i32 @test7(i32 %x) {
 ; CHECK-LABEL: test7
 ; CHECK: lsl {{w[0-9]+}}, w0, #3
@@ -57,10 +67,18 @@
 ; CHECK-LABEL: test9
 ; CHECK: add w0, w0, w0, lsl #3
 
-  %mul = mul nsw i32 %x, 9
+  %mul = mul nsw i32 %x, 9
   ret i32 %mul
 }
 
+define i32 @test10(i32 %x) {
+; CHECK-LABEL: test10
+; CHECK: add w8, w0, w0, lsl #2
+; CHECK: lsl w0, w8, #1
+
+  %mul = mul nsw i32 %x, 10
+  ret i32 %mul
+}
+
 ; Convert mul x, -pow2 to shift.
 ; Convert mul x, -(pow2 +/- 1) to shift + add/sub.
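
Note on the decomposition, for reviewers following along: the combine factors the constant C as (2^N + 1) * 2^M, where M is the countTrailingZeros result (Lg2 above) and the odd part is accepted when (C >> M) - 1 is a power of two; it then emits (shl (add (shl x, N), x), M). Below is a minimal standalone C++ sketch, not part of the patch: matchShiftAddShift is a hypothetical name, and __builtin_ctzll stands in for APInt::countTrailingZeros/logBase2. It mirrors the matching logic and checks the identity behind the new test6/test10 CHECK lines.

// Sketch only: verify the (2^N + 1) * 2^M decomposition outside of LLVM.
#include <cassert>
#include <cstdint>

// Hypothetical helper, not in the patch: returns true and sets N and M when
// C == (2^N + 1) * 2^M. Mirrors the patch: M is the trailing-zero count
// (Lg2), the odd part plays the role of Value.ashr(Lg2), and odd - 1 must
// be a power of two.
static bool matchShiftAddShift(uint64_t C, unsigned &N, unsigned &M) {
  if (C == 0)
    return false;
  M = static_cast<unsigned>(__builtin_ctzll(C));   // like Value.countTrailingZeros()
  uint64_t Odd = C >> M;                           // like Value.ashr(Lg2)
  uint64_t VM1 = Odd - 1;                          // like "Shift - 1"
  if (VM1 == 0 || (VM1 & (VM1 - 1)) != 0)          // like VM1.isPowerOf2()
    return false;
  N = static_cast<unsigned>(__builtin_ctzll(VM1)); // like VM1.logBase2()
  return true;
}

int main() {
  unsigned N = 0, M = 0;
  assert(matchShiftAddShift(6, N, M) && N == 1 && M == 1);  // 6  = (2^1+1)*2^1
  assert(matchShiftAddShift(10, N, M) && N == 2 && M == 1); // 10 = (2^2+1)*2^1
  assert(matchShiftAddShift(9, N, M) && N == 3 && M == 0);  // 9  = 2^3+1, no outer shift
  assert(!matchShiftAddShift(7, N, M));                     // 7  = 2^3-1, not this path

  // The emitted form for mul x, 10: (shl (add (shl x, 2), x), 1),
  // i.e. "add w8, w0, w0, lsl #2" followed by "lsl w0, w8, #1" in test10.
  uint64_t x = 12345;
  assert(x * 10 == (((x << 2) + x) << 1));
  return 0;
}

Constants such as 7 = 2^3 - 1 are deliberately rejected by this path and remain covered by the existing (sub (shl x, N), x) pattern further down, while the Lg2 guards keep multiplies that might fold into smull/umull or madd/msub out of the shift+add+shift form.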