Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13001,12 +13001,44 @@
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
+  // and in MachineCombiner pass, add+mul will be combined into madd.
+  // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue MulOper;
+  unsigned Opc;
+
+  auto IsAddSubWith1 = [&](SDValue V) -> bool {
+    Opc = V->getOpcode();
+    if ((Opc == ISD::ADD || Opc == ISD::SUB) && V->hasOneUse()) {
+      SDValue Opnd = V->getOperand(1);
+      MulOper = V->getOperand(0);
+      if (Opc == ISD::SUB)
+        std::swap(Opnd, MulOper);
+      if (auto C = dyn_cast<ConstantSDNode>(Opnd))
+        return C->isOne();
+    }
+    return false;
+  };
+
+  if (IsAddSubWith1(N0)) {
+    SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
+    return DAG.getNode(Opc, DL, VT, N1, MulVal);
+  }
+
+  if (IsAddSubWith1(N1)) {
+    SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
+    return DAG.getNode(Opc, DL, VT, N0, MulVal);
+  }
+
   // The below optimizations require a constant RHS.
-  if (!isa<ConstantSDNode>(N->getOperand(1)))
+  if (!isa<ConstantSDNode>(N1))
     return SDValue();
 
-  SDValue N0 = N->getOperand(0);
-  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
+  ConstantSDNode *C = cast<ConstantSDNode>(N1);
   const APInt &ConstValue = C->getAPIntValue();
 
   // Allow the scaling to be folded into the `cnt` instruction by preventing
@@ -13084,8 +13116,6 @@
     return SDValue();
   }
 
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
   SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
                                    DAG.getConstant(ShiftAmt, DL, MVT::i64));
Index: llvm/test/CodeGen/AArch64/madd-combiner.ll
===================================================================
--- llvm/test/CodeGen/AArch64/madd-combiner.ll
+++ llvm/test/CodeGen/AArch64/madd-combiner.ll
@@ -1,20 +1,25 @@
-; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FAST
 
 ; Test that we use the correct register class.
 define i32 @mul_add_imm(i32 %a, i32 %b) {
-; CHECK-LABEL: mul_add_imm
-; CHECK: orr [[REG:w[0-9]+]], wzr, #0x4
-; CHECK-NEXT: madd {{w[0-9]+}}, w0, w1, [[REG]]
+; CHECK-LABEL: mul_add_imm:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: orr w8, wzr, #0x4
+; CHECK-NEXT: madd w0, w0, w1, w8
+; CHECK-NEXT: ret
   %1 = mul i32 %a, %b
   %2 = add i32 %1, 4
   ret i32 %2
 }
 
 define i32 @mul_sub_imm1(i32 %a, i32 %b) {
-; CHECK-LABEL: mul_sub_imm1
-; CHECK: mov [[REG:w[0-9]+]], #4
-; CHECK-NEXT: msub {{w[0-9]+}}, w0, w1, [[REG]]
+; CHECK-LABEL: mul_sub_imm1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov w8, #4
+; CHECK-NEXT: msub w0, w0, w1, w8
+; CHECK-NEXT: ret
   %1 = mul i32 %a, %b
   %2 = sub i32 4, %1
   ret i32 %2
@@ -22,6 +27,29 @@
 
 ; bugpoint reduced test case.
This only tests that we pass the MI verifier. define void @mul_add_imm2() { +; CHECK-ISEL-LABEL: mul_add_imm2: +; CHECK-ISEL: ; %bb.0: ; %entry +; CHECK-ISEL-NEXT: mov w8, #1 +; CHECK-ISEL-NEXT: LBB2_1: ; %for.body8 +; CHECK-ISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-ISEL-NEXT: cbnz w8, LBB2_1 +; CHECK-ISEL-NEXT: ; %bb.2: ; %for.end20 +; CHECK-ISEL-NEXT: ret +; +; CHECK-FAST-LABEL: mul_add_imm2: +; CHECK-FAST: ; %bb.0: ; %entry +; CHECK-FAST-NEXT: mov x8, #-3 +; CHECK-FAST-NEXT: orr x9, xzr, #0xfffffffffffffffd +; CHECK-FAST-NEXT: madd x8, x8, x8, x9 +; CHECK-FAST-NEXT: mov x9, #45968 +; CHECK-FAST-NEXT: movk x9, #48484, lsl #16 +; CHECK-FAST-NEXT: movk x9, #323, lsl #32 +; CHECK-FAST-NEXT: LBB2_1: ; %for.body8 +; CHECK-FAST-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-FAST-NEXT: cmp x8, x9 +; CHECK-FAST-NEXT: b.lt LBB2_1 +; CHECK-FAST-NEXT: ; %bb.2: ; %for.end20 +; CHECK-FAST-NEXT: ret entry: br label %for.body for.body: @@ -35,3 +63,141 @@ ret void } +define i32 @add1_mul_val1(i32 %a, i32 %b) { +; CHECK-ISEL-LABEL: add1_mul_val1: +; CHECK-ISEL: ; %bb.0: +; CHECK-ISEL-NEXT: madd w0, w1, w0, w1 +; CHECK-ISEL-NEXT: ret +; +; CHECK-FAST-LABEL: add1_mul_val1: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: add w8, w0, #1 +; CHECK-FAST-NEXT: mul w0, w8, w1 +; CHECK-FAST-NEXT: ret + %1 = add i32 %a, 1 + %2 = mul i32 %1, %b + ret i32 %2 +} + +define i32 @add1_mul_val2(i32 %a, i32 %b) { +; CHECK-ISEL-LABEL: add1_mul_val2: +; CHECK-ISEL: ; %bb.0: +; CHECK-ISEL-NEXT: madd w0, w0, w1, w0 +; CHECK-ISEL-NEXT: ret +; +; CHECK-FAST-LABEL: add1_mul_val2: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: add w8, w1, #1 +; CHECK-FAST-NEXT: mul w0, w0, w8 +; CHECK-FAST-NEXT: ret + %1 = add i32 %b, 1 + %2 = mul i32 %a, %1 + ret i32 %2 +} + +define i64 @add1_mul_val3(i64 %a, i64 %b) { +; CHECK-ISEL-LABEL: add1_mul_val3: +; CHECK-ISEL: ; %bb.0: +; CHECK-ISEL-NEXT: madd x0, x0, x1, x0 +; CHECK-ISEL-NEXT: ret +; +; CHECK-FAST-LABEL: add1_mul_val3: +; CHECK-FAST: ; %bb.0: +; 
CHECK-FAST-NEXT: add x8, x1, #1 +; CHECK-FAST-NEXT: mul x0, x0, x8 +; CHECK-FAST-NEXT: ret + %1 = add i64 %b, 1 + %2 = mul i64 %a, %1 + ret i64 %2 +} + +define i64 @add1_mul_val4(i64 %a, i64 %b, i64 %c) { +; CHECK-ISEL-LABEL: add1_mul_val4: +; CHECK-ISEL: ; %bb.0: +; CHECK-ISEL-NEXT: add x8, x0, x2 +; CHECK-ISEL-NEXT: madd x0, x8, x1, x8 +; CHECK-ISEL-NEXT: ret +; +; CHECK-FAST-LABEL: add1_mul_val4: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: add x8, x1, #1 +; CHECK-FAST-NEXT: add x9, x0, x2 +; CHECK-FAST-NEXT: mul x0, x9, x8 +; CHECK-FAST-NEXT: ret + %1 = add i64 %a, %c + %2 = add i64 %b, 1 + %3 = mul i64 %1, %2 + ret i64 %3 +} + +define i32 @sub1_mul_val1(i32 %a, i32 %b) { +; CHECK-ISEL-LABEL: sub1_mul_val1: +; CHECK-ISEL: ; %bb.0: +; CHECK-ISEL-NEXT: msub w0, w1, w0, w1 +; CHECK-ISEL-NEXT: ret +; +; CHECK-FAST-LABEL: sub1_mul_val1: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: mov w8, #1 +; CHECK-FAST-NEXT: sub w8, w8, w0 +; CHECK-FAST-NEXT: mul w0, w8, w1 +; CHECK-FAST-NEXT: ret + %1 = sub i32 1, %a + %2 = mul i32 %1, %b + ret i32 %2 +} + +define i32 @sub1_mul_val2(i32 %a, i32 %b) { +; CHECK-ISEL-LABEL: sub1_mul_val2: +; CHECK-ISEL: ; %bb.0: +; CHECK-ISEL-NEXT: msub w0, w0, w1, w0 +; CHECK-ISEL-NEXT: ret +; +; CHECK-FAST-LABEL: sub1_mul_val2: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: mov w8, #1 +; CHECK-FAST-NEXT: sub w8, w8, w1 +; CHECK-FAST-NEXT: mul w0, w0, w8 +; CHECK-FAST-NEXT: ret + %1 = sub i32 1, %b + %2 = mul i32 %a, %1 + ret i32 %2 +} + +define i64 @sub1_mul_val3(i64 %a, i64 %b) { +; CHECK-ISEL-LABEL: sub1_mul_val3: +; CHECK-ISEL: ; %bb.0: +; CHECK-ISEL-NEXT: msub x0, x0, x1, x0 +; CHECK-ISEL-NEXT: ret +; +; CHECK-FAST-LABEL: sub1_mul_val3: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: mov x8, #1 +; CHECK-FAST-NEXT: sub x8, x8, x1 +; CHECK-FAST-NEXT: mul x0, x0, x8 +; CHECK-FAST-NEXT: ret + %1 = sub i64 1, %b + %2 = mul i64 %a, %1 + ret i64 %2 +} + +define i64 @sub1_mul_val4(i64 %a, i64 %b) { +; CHECK-ISEL-LABEL: sub1_mul_val4: +; CHECK-ISEL: ; %bb.0: 
+; CHECK-ISEL-NEXT: sub x8, x0, #1 +; CHECK-ISEL-NEXT: msub x0, x8, x1, x8 +; CHECK-ISEL-NEXT: ret +; +; CHECK-FAST-LABEL: sub1_mul_val4: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: mov x8, #1 +; CHECK-FAST-NEXT: sub x9, x0, #1 +; CHECK-FAST-NEXT: sub x8, x8, x1 +; CHECK-FAST-NEXT: mul x0, x9, x8 +; CHECK-FAST-NEXT: ret + %1 = sub i64 %a, 1 + %2 = sub i64 1, %b + %3 = mul i64 %1, %2 + ret i64 %3 +} +