Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -619,6 +619,9 @@
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
+  bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                              SDValue C) const override;
+
   bool isMulAddWithConstProfitable(SDValue AddNode,
                                    SDValue ConstNode) const override;
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13703,6 +13703,44 @@
   return IsLegal;
 }
 
+// Return true if the MUL can be replaced by a cheaper shift/add/sub sequence.
+bool AArch64TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                                   SDValue C) const {
+  // Check integral scalar types.
+  if (!VT.isScalarInteger() || !C->hasOneUse())
+    return false;
+  SDNode *Mul = *C->use_begin();
+  // Block SExt/ZExt operands, as ADD/SUB with sxtw/uxtw has low throughput.
+  SDValue Mul0 = Mul->getOperand(0);
+  unsigned Opcode = Mul0.getOpcode();
+  if (!Mul0->hasOneUse() || (Opcode == ISD::SIGN_EXTEND) ||
+      (Opcode == ISD::ZERO_EXTEND))
+    return false;
+  // Block INTRINSIC_WO_CHAIN, as INCH/DECH have no shifted (shl) form.
+  if ((Opcode == ISD::TRUNCATE &&
+       Mul0->getOperand(0)->getOpcode() == ISD::INTRINSIC_WO_CHAIN) ||
+      Opcode == ISD::INTRINSIC_WO_CHAIN)
+    return false;
+
+  if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
+    if (!ConstNode->getAPIntValue().isSignedIntN(64))
+      return false;
+
+    // TODO: Negative constants can be allowed once performMulCombine is
+    // improved.
+    // Decompose the MUL into two LSL instructions and an ADD/SUB.
+    const APInt &Imm = ConstNode->getAPIntValue();
+    unsigned Shift = Imm.countTrailingZeros();
+    if (Shift < 12) {
+      APInt ImmS = Imm.ashr(Shift);
+      if ((ImmS + 1).isPowerOf2() || (ImmS - 1).isPowerOf2())
+        return true;
+    }
+  }
+
+  return false;
+}
+
 // Return false to prevent folding
 // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
 // if the folding leads to worse code.
Index: llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll
+++ llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll
@@ -17,6 +17,12 @@
 ; Check that when two complex GEPs are used in two basic blocks, LLVM can
 ; eliminate the common subexpression for the second use.
 define void @test_GEP_CSE([240 x %struct]* %string, i32* %adj, i32 %lib, i64 %idxprom) {
+; CHECK-LABEL: test_GEP_CSE:
+; CHECK: lsl x8, x3, #6
+; CHECK-NEXT: mov w9, #23052
+; CHECK-NEXT: add x8, x8, x3, lsl #5
+; CHECK-NEXT: add x8, x0, x8
+; CHECK-NEXT: ldr w9, [x8, x9]
   %liberties = getelementptr [240 x %struct], [240 x %struct]* %string, i64 1, i64 %idxprom, i32 3
   %1 = load i32, i32* %liberties, align 4
   %cmp = icmp eq i32 %1, %lib
@@ -32,12 +38,6 @@
   ret void
 }
 
-; CHECK-LABEL: test_GEP_CSE:
-; CHECK: madd
-; CHECK: ldr
-; CHECK-NOT: madd
-; CHECK:ldr
-
 ; CHECK-NoAA-LABEL: @test_GEP_CSE(
 ; CHECK-NoAA: [[PTR0:%[a-zA-Z0-9]+]] = ptrtoint [240 x %struct]* %string to i64
 ; CHECK-NoAA: [[PTR1:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
Index: llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
===================================================================
--- llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
+++ llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
@@ -13,16 +13,16 @@
 ; Make sure that inst-combine fuses the multiply add in the addressing mode of
 ; the load.
 
-; CHECK-LABEL: fun:
-; CHECK-NOT: mul
-; CHECK: madd
-; CHECK-NOT: mul
-
 %class.D = type { %class.basic_string.base, [4 x i8] }
 %class.basic_string.base = type <{ i64, i64, i32 }>
 @a = global %class.D* zeroinitializer, align 8
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)
 
 define internal void @fun() section ".text.startup" {
+; CHECK-LABEL: fun:
+; CHECK-NOT: mul
+; CHECK: .LBB0_1:
+; CHECK: lsl
+; CHECK-NOT: mul
 entry:
   %tmp.i.i = alloca %class.D, align 8
   %y = bitcast %class.D* %tmp.i.i to i8*
Index: llvm/test/CodeGen/AArch64/mul_pow2.ll
===================================================================
--- llvm/test/CodeGen/AArch64/mul_pow2.ll
+++ llvm/test/CodeGen/AArch64/mul_pow2.ll
@@ -71,8 +71,8 @@
 define i32 @test6_32b(i32 %x) {
 ; CHECK-LABEL: test6_32b:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, w0, lsl #1
-; CHECK-NEXT: lsl w0, w8, #1
+; CHECK-NEXT: lsl w8, w0, #2
+; CHECK-NEXT: add w0, w8, w0, lsl #1
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test6_32b:
@@ -88,8 +88,8 @@
 define i64 @test6_64b(i64 %x) {
 ; CHECK-LABEL: test6_64b:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, x0, lsl #1
-; CHECK-NEXT: lsl x0, x8, #1
+; CHECK-NEXT: lsl x8, x0, #2
+; CHECK-NEXT: add x0, x8, x0, lsl #1
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test6_64b:
@@ -143,8 +143,9 @@
 define i32 @test6_madd(i32 %x, i32 %y) {
 ; CHECK-LABEL: test6_madd:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #6
-; CHECK-NEXT: madd w0, w0, w8, w1
+; CHECK-NEXT: lsl w8, w0, #2
+; CHECK-NEXT: add w8, w8, w0, lsl #1
+; CHECK-NEXT: add w0, w8, w1
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test6_madd:
@@ -161,8 +162,9 @@
 define i32 @test6_msub(i32 %x, i32 %y) {
 ; CHECK-LABEL: test6_msub:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #6
-; CHECK-NEXT: msub w0, w0, w8, w1
+; CHECK-NEXT: lsl w8, w0, #2
+; CHECK-NEXT: add w8, w8, w0, lsl #1
+; CHECK-NEXT: sub w0, w1, w8
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test6_msub:
@@ -290,6 +292,25 @@
   ret i64 %sub
 }
 
+define i32 @mull6_sub(i32 %x) {
+; CHECK-LABEL: mull6_sub:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl w8, w0, #2
+; CHECK-NEXT: add w8, w8, w0, lsl #1
+; CHECK-NEXT: sub w0, w8, #1
+; CHECK-NEXT: ret
+;
+; GISEL-LABEL: mull6_sub:
+; GISEL: // %bb.0:
+; GISEL-NEXT: mov w8, #6
+; GISEL-NEXT: mul w8, w0, w8
+; GISEL-NEXT: sub w0, w8, #1
+; GISEL-NEXT: ret
+  %mul = mul nsw i32 %x, 6
+  %sub = add nsw i32 %mul, -1
+  ret i32 %sub
+}
+
 define i32 @test7(i32 %x) {
 ; CHECK-LABEL: test7:
 ; CHECK: // %bb.0:
@@ -340,8 +361,8 @@
 define i32 @test10(i32 %x) {
 ; CHECK-LABEL: test10:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, w0, lsl #2
-; CHECK-NEXT: lsl w0, w8, #1
+; CHECK-NEXT: lsl w8, w0, #3
+; CHECK-NEXT: add w0, w8, w0, lsl #1
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test10:
@@ -374,8 +395,8 @@
 define i32 @test12(i32 %x) {
 ; CHECK-LABEL: test12:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, w0, lsl #1
-; CHECK-NEXT: lsl w0, w8, #2
+; CHECK-NEXT: lsl w8, w0, #3
+; CHECK-NEXT: add w0, w8, w0, lsl #2
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test12:
@@ -408,8 +429,8 @@
 define i32 @test14(i32 %x) {
 ; CHECK-LABEL: test14:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #14
-; CHECK-NEXT: mul w0, w0, w8
+; CHECK-NEXT: lsl w8, w0, #4
+; CHECK-NEXT: sub w0, w8, w0, lsl #1
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test14:
@@ -731,11 +752,11 @@
 ;
 ; GISEL-LABEL: muladd_demand_commute:
 ; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI42_1
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI42_1]
-; GISEL-NEXT: adrp x8, .LCPI42_0
+; GISEL-NEXT: adrp x8, .LCPI43_1
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI43_1]
+; GISEL-NEXT: adrp x8, .LCPI43_0
 ; GISEL-NEXT: mla v1.4s, v0.4s, v2.4s
-; GISEL-NEXT: ldr q0, [x8, :lo12:.LCPI42_0]
+; GISEL-NEXT: ldr q0, [x8, :lo12:.LCPI43_0]
 ; GISEL-NEXT: and v0.16b, v1.16b, v0.16b
 ; GISEL-NEXT: ret
   %m = mul <4 x i32> %x,
Index: llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
===================================================================
--- llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -23,12 +23,13 @@
 define i1 @test_srem_even(i4 %X) nounwind {
 ; CHECK-LABEL: test_srem_even:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sbfx w9, w0, #0, #4
-; CHECK-NEXT: mov w8, #6
-; CHECK-NEXT: add w9, w9, w9, lsl #1
-; CHECK-NEXT: ubfx w10, w9, #7, #1
-; CHECK-NEXT: add w9, w10, w9, lsr #4
-; CHECK-NEXT: msub w8, w9, w8, w0
+; CHECK-NEXT: sbfx w8, w0, #0, #4
+; CHECK-NEXT: add w8, w8, w8, lsl #1
+; CHECK-NEXT: ubfx w9, w8, #7, #1
+; CHECK-NEXT: add w8, w9, w8, lsr #4
+; CHECK-NEXT: lsl w9, w8, #2
+; CHECK-NEXT: add w8, w9, w8, lsl #1
+; CHECK-NEXT: sub w8, w0, w8
 ; CHECK-NEXT: and w8, w8, #0xf
 ; CHECK-NEXT: cmp w8, #1
 ; CHECK-NEXT: cset w0, eq
Index: llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
===================================================================
--- llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
+++ llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
@@ -220,8 +220,8 @@
 define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, i8* %res) {
 ; CHECK-LABEL: promote_arg_return:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, w0, lsl #1
-; CHECK-NEXT: add w8, w8, #45
+; CHECK-NEXT: add w8, w0, #15
+; CHECK-NEXT: add w8, w8, w8, lsl #1
 ; CHECK-NEXT: cmp w8, w1
 ; CHECK-NEXT: cset w8, lo
 ; CHECK-NEXT: strb w8, [x2]
Index: llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
===================================================================
--- llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
+++ llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
@@ -5,37 +5,38 @@
 ; CHECK-LABEL: fold_urem_vec_1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: mov w9, #8969
-; CHECK-NEXT: movk w9, #22765, lsl #16
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: mov w8, #8969
+; CHECK-NEXT: movk w8, #22765, lsl #16
 ; CHECK-NEXT: umov w10, v0.h[1]
 ; CHECK-NEXT: mov w12, #16913
-; CHECK-NEXT: mov w13, #95
 ; CHECK-NEXT: movk w12, #8456, lsl #16
-; CHECK-NEXT: umull x9, w8, w9
-; CHECK-NEXT: ubfx w14, w10, #2, #14
-; CHECK-NEXT: lsr x9, x9, #32
-; CHECK-NEXT: sub w11, w8, w9
-; CHECK-NEXT: umull x12, w14, w12
-; CHECK-NEXT: add w9, w9, w11, lsr #1
-; CHECK-NEXT: umov w11, v0.h[2]
-; CHECK-NEXT: lsr w9, w9, #6
+; CHECK-NEXT: umull x8, w9, w8
+; CHECK-NEXT: ubfx w13, w10, #2, #14
+; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: sub w11, w9, w8
+; CHECK-NEXT: umull x12, w13, w12
+; CHECK-NEXT: umov w13, v0.h[2]
+; CHECK-NEXT: add w8, w8, w11, lsr #1
+; CHECK-NEXT: mov w11, #95
+; CHECK-NEXT: lsr w8, w8, #6
 ; CHECK-NEXT: lsr x12, x12, #34
-; CHECK-NEXT: msub w8, w9, w13, w8
+; CHECK-NEXT: msub w8, w8, w11, w9
 ; CHECK-NEXT: mov w9, #33437
 ; CHECK-NEXT: movk w9, #21399, lsl #16
-; CHECK-NEXT: mov w13, #124
-; CHECK-NEXT: umull x9, w11, w9
-; CHECK-NEXT: msub w10, w12, w13, w10
+; CHECK-NEXT: lsl w11, w12, #2
+; CHECK-NEXT: sub w11, w11, w12, lsl #7
 ; CHECK-NEXT: umov w12, v0.h[3]
+; CHECK-NEXT: umull x9, w13, w9
+; CHECK-NEXT: add w10, w10, w11
 ; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov w13, #2287
+; CHECK-NEXT: mov w11, #2287
 ; CHECK-NEXT: lsr x8, x9, #37
 ; CHECK-NEXT: mov w9, #98
-; CHECK-NEXT: movk w13, #16727, lsl #16
-; CHECK-NEXT: msub w8, w8, w9, w11
+; CHECK-NEXT: movk w11, #16727, lsl #16
+; CHECK-NEXT: msub w8, w8, w9, w13
 ; CHECK-NEXT: mov v0.h[1], w10
-; CHECK-NEXT: umull x9, w12, w13
+; CHECK-NEXT: umull x9, w12, w11
 ; CHECK-NEXT: mov w10, #1003
 ; CHECK-NEXT: lsr x9, x9, #40
 ; CHECK-NEXT: mov v0.h[2], w8
Index: llvm/test/Transforms/SeparateConstOffsetFromGEP/AArch64/split-gep.ll
===================================================================
--- llvm/test/Transforms/SeparateConstOffsetFromGEP/AArch64/split-gep.ll
+++ llvm/test/Transforms/SeparateConstOffsetFromGEP/AArch64/split-gep.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O3 -mtriple=aarch64-linux-gnu | FileCheck %s
 
 %struct = type { i32, i32, i32 }
@@ -5,8 +6,9 @@
 define i32 @test1(%struct* %ptr, i64 %idx) {
 ; CHECK-LABEL: test1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #12
-; CHECK-NEXT: madd x8, x1, x8, x0
+; CHECK-NEXT: lsl x8, x1, #3
+; CHECK-NEXT: add x8, x8, x1, lsl #2
+; CHECK-NEXT: add x8, x0, x8
 ; CHECK-NEXT: ldr w9, [x8, #4]
 ; CHECK-NEXT: tbnz w9, #31, .LBB0_2
 ; CHECK-NEXT: // %bb.1:
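
For reference, decomposeMulByConstant accepts exactly the constants of the form (2^N +/- 1) << Shift with Shift < 12, which is what turns the mul-by-6 and mul-by-14 tests above into lsl+add and lsl+sub pairs. Below is a minimal standalone C++ sketch of that acceptance test; isPow2, countTrailingZeros64, and decomposable are invented names standing in for LLVM's APInt::isPowerOf2() and APInt::countTrailingZeros(), and the sketch is separate from the patch itself.

#include <cassert>
#include <cstdint>

// Invented stand-in for APInt::isPowerOf2().
static bool isPow2(uint64_t V) { return V != 0 && (V & (V - 1)) == 0; }

// Invented stand-in for APInt::countTrailingZeros(); caller ensures V != 0.
static unsigned countTrailingZeros64(uint64_t V) {
  unsigned N = 0;
  for (; (V & 1) == 0; V >>= 1)
    ++N;
  return N;
}

// Mirrors the accept path of the hook: strip trailing zeros (the outer LSL),
// then require the odd remainder to be one away from a power of two, so the
// MUL becomes two shifts plus one ADD/SUB.
bool decomposable(int64_t Imm) {
  if (Imm <= 0) // per the TODO in the patch, negative constants are rejected
    return false;
  unsigned Shift = countTrailingZeros64(static_cast<uint64_t>(Imm));
  if (Shift >= 12)
    return false;
  uint64_t ImmS = static_cast<uint64_t>(Imm) >> Shift;
  return isPow2(ImmS + 1) || isPow2(ImmS - 1);
}

int main() {
  assert(decomposable(6));   // 6 = (4 - 1) << 1; emitted as 4x + 2x (lsl/add)
  assert(decomposable(14));  // 14 = (8 - 1) << 1; emitted as 16x - 2x (lsl/sub)
  assert(decomposable(12));  // 12 = 3 << 2
  assert(!decomposable(22)); // 22 = 11 << 1; 11 is not 2^N +/- 1
  return 0;
}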