diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3622,19 +3622,30 @@ getShiftAmountTy(N0.getValueType())))); } - // Try to transform multiply-by-(power-of-2 +/- 1) into shift and add/sub. + // Try to transform: + // (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub. // mul x, (2^N + 1) --> add (shl x, N), x // mul x, (2^N - 1) --> sub (shl x, N), x // Examples: x * 33 --> (x << 5) + x // x * 15 --> (x << 4) - x // x * -33 --> -((x << 5) + x) // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4) + // (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub. + // mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M)) + // mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M)) + // Examples: x * 0x8800 --> (x << 15) + (x << 11) + // x * 0xf800 --> (x << 16) - (x << 11) + // x * -0x8800 --> -((x << 15) + (x << 11)) + // x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16) if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) { // TODO: We could handle more general decomposition of any constant by // having the target set a limit on number of ops and making a // callback to determine that sequence (similar to sqrt expansion). unsigned MathOp = ISD::DELETED_NODE; APInt MulC = ConstValue1.abs(); + // The constant `2` should be treated as (2^0 + 1). + unsigned TZeros = MulC == 2 ? 0 : MulC.countTrailingZeros(); + MulC.lshrInPlace(TZeros); if ((MulC - 1).isPowerOf2()) MathOp = ISD::ADD; else if ((MulC + 1).isPowerOf2()) @@ -3643,12 +3654,17 @@ if (MathOp != ISD::DELETED_NODE) { unsigned ShAmt = MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2(); + ShAmt += TZeros; assert(ShAmt < VT.getScalarSizeInBits() && "multiply-by-constant generated out of bounds shift"); SDLoc DL(N); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT)); - SDValue R = DAG.getNode(MathOp, DL, VT, Shl, N0); + SDValue R = + TZeros ? DAG.getNode(MathOp, DL, VT, Shl, + DAG.getNode(ISD::SHL, DL, VT, N0, + DAG.getConstant(TZeros, DL, VT))) + : DAG.getNode(MathOp, DL, VT, Shl, N0); if (ConstValue1.isNegative()) R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R); return R; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -931,6 +931,9 @@ return true; } + bool decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const override; + bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override { // Only handle float load/store pair because float(fpr) load/store diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -16057,6 +16057,32 @@ return true; } +bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const { + // Check integral scalar types. + if (!VT.isScalarInteger()) + return false; + if (auto *ConstNode = dyn_cast(C.getNode())) { + if (!ConstNode->getAPIntValue().isSignedIntN(64)) + return false; + // This transformation will generate >= 2 operations. But the following + // cases will generate <= 2 instructions during ISEL. So exclude them. + // 1. If the constant multiplier fits 16 bits, it can be handled by one + // HW instruction, ie. MULLI + // 2. If the multiplier after shifted fits 16 bits, an extra shift + // instruction is needed than case 1, ie. MULLI and RLDICR + int64_t Imm = ConstNode->getSExtValue(); + unsigned Shift = countTrailingZeros(Imm); + Imm >>= Shift; + if (isInt<16>(Imm)) + return false; + if (isPowerOf2_64(Imm + 1) || isPowerOf2_64(Imm - 1) || + isPowerOf2_64(1 - Imm) || isPowerOf2_64(-1 - Imm)) + return true; + } + return false; +} + bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { return isFMAFasterThanFMulAndFAdd( diff --git a/llvm/test/CodeGen/PowerPC/mulli.ll b/llvm/test/CodeGen/PowerPC/mulli.ll --- a/llvm/test/CodeGen/PowerPC/mulli.ll +++ b/llvm/test/CodeGen/PowerPC/mulli.ll @@ -48,10 +48,9 @@ define i64 @test5(i64 %x) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: -; CHECK-NEXT: lis 4, 16 -; CHECK-NEXT: ori 4, 4, 1 -; CHECK-NEXT: sldi 4, 4, 12 -; CHECK-NEXT: mulld 3, 3, 4 +; CHECK-NEXT: sldi 4, 3, 12 +; CHECK-NEXT: sldi 3, 3, 32 +; CHECK-NEXT: add 3, 3, 4 ; CHECK-NEXT: blr %y = mul i64 %x, 4294971392 ret i64 %y @@ -60,10 +59,10 @@ define i64 @test6(i64 %x) { ; CHECK-LABEL: test6: ; CHECK: # %bb.0: -; CHECK-NEXT: lis 4, -17 -; CHECK-NEXT: ori 4, 4, 65535 -; CHECK-NEXT: sldi 4, 4, 12 -; CHECK-NEXT: mulld 3, 3, 4 +; CHECK-NEXT: sldi 4, 3, 12 +; CHECK-NEXT: sldi 3, 3, 32 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: neg 3, 3 ; CHECK-NEXT: blr %y = mul i64 %x, -4294971392 ret i64 %y @@ -72,10 +71,9 @@ define i64 @test7(i64 %x) { ; CHECK-LABEL: test7: ; CHECK: # %bb.0: -; CHECK-NEXT: lis 4, 31 -; CHECK-NEXT: ori 4, 4, 65535 -; CHECK-NEXT: sldi 4, 4, 13 -; CHECK-NEXT: mulld 3, 3, 4 +; CHECK-NEXT: sldi 4, 3, 34 +; CHECK-NEXT: sldi 3, 3, 13 +; CHECK-NEXT: sub 3, 4, 3 ; CHECK-NEXT: blr %y = mul i64 %x, 17179860992 ret i64 %y @@ -84,10 +82,9 @@ define i64 @test8(i64 %x) { ; CHECK-LABEL: test8: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, -4 -; CHECK-NEXT: sldi 4, 4, 32 -; CHECK-NEXT: ori 4, 4, 8192 -; CHECK-NEXT: mulld 3, 3, 4 +; CHECK-NEXT: sldi 4, 3, 13 +; CHECK-NEXT: sldi 3, 3, 34 +; CHECK-NEXT: sub 3, 4, 3 ; CHECK-NEXT: blr %y = mul i64 %x, -17179860992 ret i64 %y @@ -96,12 +93,11 @@ define i64 @test9(i64 %x) { ; CHECK-LABEL: test9: ; CHECK: # %bb.0: -; CHECK-NEXT: lis 4, 16 +; CHECK-NEXT: sldi 4, 3, 12 +; CHECK-NEXT: sldi 5, 3, 32 +; CHECK-NEXT: add 4, 5, 4 ; CHECK-NEXT: li 5, 8193 -; CHECK-NEXT: ori 4, 4, 1 ; CHECK-NEXT: sldi 5, 5, 19 -; CHECK-NEXT: sldi 4, 4, 12 -; CHECK-NEXT: mulld 4, 3, 4 ; CHECK-NEXT: mulld 3, 3, 5 ; CHECK-NEXT: sub 3, 4, 3 ; CHECK-NEXT: blr @@ -114,13 +110,8 @@ define i64 @test10(i64 %x) { ; CHECK-LABEL: test10: ; CHECK: # %bb.0: -; CHECK-NEXT: lis 4, 31 -; CHECK-NEXT: lis 5, 16383 -; CHECK-NEXT: ori 4, 4, 65535 -; CHECK-NEXT: ori 5, 5, 57344 -; CHECK-NEXT: sldi 4, 4, 13 -; CHECK-NEXT: mulld 4, 3, 4 -; CHECK-NEXT: mulld 3, 3, 5 +; CHECK-NEXT: sldi 4, 3, 34 +; CHECK-NEXT: sldi 3, 3, 30 ; CHECK-NEXT: sub 3, 4, 3 ; CHECK-NEXT: blr %y = mul i64 %x, 17179860992