Index: llvm/lib/Target/AArch64/AArch64FastISel.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -4830,27 +4830,18 @@ return true; } - int64_t Pow2MinusOne = (1ULL << Lg2) - 1; - unsigned AddReg = emitAdd_ri_(VT, Src0Reg, Pow2MinusOne); - if (!AddReg) - return false; - - // (Src0 < 0) ? Pow2 - 1 : 0; - if (!emitICmp_ri(VT, Src0Reg, 0)) - return false; + unsigned BitWidth = VT.getScalarSizeInBits(); + unsigned SignReg = Src0Reg; + if (Lg2 > 1) { - unsigned SelectOpc; - const TargetRegisterClass *RC; - if (VT == MVT::i64) { - SelectOpc = AArch64::CSELXr; - RC = &AArch64::GPR64RegClass; - } else { - SelectOpc = AArch64::CSELWr; - RC = &AArch64::GPR32RegClass; + SignReg = emitASR_ri(VT, VT, Src0Reg, BitWidth - 1); + if (!SignReg) + return false; } - Register SelectReg = fastEmitInst_rri(SelectOpc, RC, AddReg, Src0Reg, - AArch64CC::LT); - if (!SelectReg) + + unsigned AddReg = emitAddSub_rs(/*UseAdd=*/true, VT, Src0Reg, SignReg, + AArch64_AM::LSR, BitWidth - Lg2); + if (!AddReg) return false; // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also @@ -4858,10 +4849,10 @@ unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; unsigned ResultReg; if (C.isNegative()) - ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, SelectReg, + ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, AddReg, AArch64_AM::ASR, Lg2); else - ResultReg = emitASR_ri(VT, VT, SelectReg, Lg2); + ResultReg = emitASR_ri(VT, VT, AddReg, Lg2); if (!ResultReg) return false; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13536,38 +13536,7 @@ if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) return SDValue(N, 0); - // fold (sdiv X, pow2) - if ((VT != MVT::i32 && VT != MVT::i64) || - !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) - return SDValue(); - - SDLoc DL(N); - SDValue N0 = N->getOperand(0); - unsigned Lg2 = Divisor.countTrailingZeros(); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); - - // Add (N0 < 0) ? Pow2 - 1 : 0; - SDValue CCVal; - SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); - SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); - - Created.push_back(Cmp.getNode()); - Created.push_back(Add.getNode()); - Created.push_back(CSel.getNode()); - - // Divide by pow2. - SDValue SRA = - DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); - - // If we're dividing by a positive value, we're done. Otherwise, we must - // negate the result. - if (Divisor.isNonNegative()) - return SRA; - - Created.push_back(SRA.getNode()); - return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); + return SDValue(); } static bool IsSVECntIntrinsic(SDValue S) { Index: llvm/test/CodeGen/AArch64/fast-isel-sdiv.ll =================================================================== --- llvm/test/CodeGen/AArch64/fast-isel-sdiv.ll +++ llvm/test/CodeGen/AArch64/fast-isel-sdiv.ll @@ -14,9 +14,8 @@ define i32 @sdiv_i32_pos(i32 %a) { ; CHECK-LABEL: sdiv_i32_pos: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #7 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: asr w8, w0, #31 +; CHECK-NEXT: add w8, w0, w8, lsr #29 ; CHECK-NEXT: asr w0, w8, #3 ; CHECK-NEXT: ret %1 = sdiv i32 %a, 8 @@ -26,9 +25,8 @@ define i32 @sdiv_i32_neg(i32 %a) { ; CHECK-LABEL: sdiv_i32_neg: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #7 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: asr w8, w0, #31 +; CHECK-NEXT: add w8, w0, w8, lsr #29 ; CHECK-NEXT: neg w0, w8, asr #3 ; CHECK-NEXT: ret %1 = sdiv i32 %a, -8 @@ -47,9 +45,8 @@ define i64 @sdiv_i64_pos(i64 %a) { ; CHECK-LABEL: sdiv_i64_pos: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #15 -; CHECK-NEXT: cmp x0, #0 -; CHECK-NEXT: csel x8, x8, x0, lt +; CHECK-NEXT: asr x8, x0, #63 +; CHECK-NEXT: add x8, x0, x8, lsr #60 ; CHECK-NEXT: asr x0, x8, #4 ; CHECK-NEXT: ret %1 = sdiv i64 %a, 16 @@ -59,9 +56,8 @@ define i64 @sdiv_i64_neg(i64 %a) { ; CHECK-LABEL: sdiv_i64_neg: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #15 -; CHECK-NEXT: cmp x0, #0 -; CHECK-NEXT: csel x8, x8, x0, lt +; CHECK-NEXT: asr x8, x0, #63 +; CHECK-NEXT: add x8, x0, x8, lsr #60 ; CHECK-NEXT: neg x0, x8, asr #4 ; CHECK-NEXT: ret %1 = sdiv i64 %a, -16 Index: llvm/test/CodeGen/AArch64/sdivpow2.ll =================================================================== --- llvm/test/CodeGen/AArch64/sdivpow2.ll +++ llvm/test/CodeGen/AArch64/sdivpow2.ll @@ -1,13 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ISEL -; RUN: llc -mtriple=aarch64-linux-gnu -fast-isel=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,FAST +; RUN: llc -mtriple=aarch64-linux-gnu -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -fast-isel=1 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK define i32 @test1(i32 %x) { ; CHECK-LABEL: test1: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #7 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: asr w8, w0, #31 +; CHECK-NEXT: add w8, w0, w8, lsr #29 ; CHECK-NEXT: asr w0, w8, #3 ; CHECK-NEXT: ret %div = sdiv i32 %x, 8 @@ -17,9 +16,8 @@ define i32 @test2(i32 %x) { ; CHECK-LABEL: test2: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #7 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: asr w8, w0, #31 +; CHECK-NEXT: add w8, w0, w8, lsr #29 ; CHECK-NEXT: neg w0, w8, asr #3 ; CHECK-NEXT: ret %div = sdiv i32 %x, -8 @@ -29,9 +27,8 @@ define i32 @test3(i32 %x) { ; CHECK-LABEL: test3: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #31 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: asr w8, w0, #31 +; CHECK-NEXT: add w8, w0, w8, lsr #27 ; CHECK-NEXT: asr w0, w8, #5 ; CHECK-NEXT: ret %div = sdiv i32 %x, 32 @@ -41,9 +38,8 @@ define i64 @test4(i64 %x) { ; CHECK-LABEL: test4: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #7 -; CHECK-NEXT: cmp x0, #0 -; CHECK-NEXT: csel x8, x8, x0, lt +; CHECK-NEXT: asr x8, x0, #63 +; CHECK-NEXT: add x8, x0, x8, lsr #61 ; CHECK-NEXT: asr x0, x8, #3 ; CHECK-NEXT: ret %div = sdiv i64 %x, 8 @@ -53,9 +49,8 @@ define i64 @test5(i64 %x) { ; CHECK-LABEL: test5: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #7 -; CHECK-NEXT: cmp x0, #0 -; CHECK-NEXT: csel x8, x8, x0, lt +; CHECK-NEXT: asr x8, x0, #63 +; CHECK-NEXT: add x8, x0, x8, lsr #61 ; CHECK-NEXT: neg x0, x8, asr #3 ; CHECK-NEXT: ret %div = sdiv i64 %x, -8 @@ -65,9 +60,8 @@ define i64 @test6(i64 %x) { ; CHECK-LABEL: test6: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #63 -; CHECK-NEXT: cmp x0, #0 -; CHECK-NEXT: csel x8, x8, x0, lt +; CHECK-NEXT: asr x8, x0, #63 +; CHECK-NEXT: add x8, x0, x8, lsr #58 ; CHECK-NEXT: asr x0, x8, #6 ; CHECK-NEXT: ret %div = sdiv i64 %x, 64 @@ -77,10 +71,8 @@ define i64 @test7(i64 %x) { ; CHECK-LABEL: test7: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #281474976710655 -; CHECK-NEXT: cmp x0, #0 -; CHECK-NEXT: add x8, x0, x8 -; CHECK-NEXT: csel x8, x8, x0, lt +; CHECK-NEXT: asr x8, x0, #63 +; CHECK-NEXT: add x8, x0, x8, lsr #16 ; CHECK-NEXT: asr x0, x8, #48 ; CHECK-NEXT: ret %div = sdiv i64 %x, 281474976710656 @@ -88,20 +80,11 @@ } define i64 @test8(i64 %x) { -; ISEL-LABEL: test8: -; ISEL: // %bb.0: -; ISEL-NEXT: cmp x0, #0 -; ISEL-NEXT: cinc x8, x0, lt -; ISEL-NEXT: asr x0, x8, #1 -; ISEL-NEXT: ret -; -; FAST-LABEL: test8: -; FAST: // %bb.0: -; FAST-NEXT: add x8, x0, #1 -; FAST-NEXT: cmp x0, #0 -; FAST-NEXT: csel x8, x8, x0, lt -; FAST-NEXT: asr x0, x8, #1 -; FAST-NEXT: ret +; CHECK-LABEL: test8: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x0, lsr #63 +; CHECK-NEXT: asr x0, x8, #1 +; CHECK-NEXT: ret %div = sdiv i64 %x, 2 ret i64 %div } Index: llvm/test/CodeGen/AArch64/srem-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-lkk.ll +++ llvm/test/CodeGen/AArch64/srem-lkk.ll @@ -95,9 +95,8 @@ define i32 @dont_fold_srem_power_of_two(i32 %x) { ; CHECK-LABEL: dont_fold_srem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #63 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: asr w8, w0, #31 +; CHECK-NEXT: add w8, w0, w8, lsr #26 ; CHECK-NEXT: and w8, w8, #0xffffffc0 ; CHECK-NEXT: sub w0, w0, w8 ; CHECK-NEXT: ret @@ -119,10 +118,8 @@ define i32 @dont_fold_srem_i32_smax(i32 %x) { ; CHECK-LABEL: dont_fold_srem_i32_smax: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #2147483647 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: add w8, w0, w8 -; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: asr w8, w0, #31 +; CHECK-NEXT: add w8, w0, w8, lsr #1 ; CHECK-NEXT: and w8, w8, #0x80000000 ; CHECK-NEXT: add w0, w0, w8 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/AArch64/srem-seteq.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-seteq.ll +++ llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -234,9 +234,8 @@ define i32 @test_srem_pow2(i32 %X) nounwind { ; CHECK-LABEL: test_srem_pow2: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #15 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: asr w8, w0, #31 +; CHECK-NEXT: add w8, w0, w8, lsr #28 ; CHECK-NEXT: and w8, w8, #0xfffffff0 ; CHECK-NEXT: cmp w0, w8 ; CHECK-NEXT: cset w0, eq @@ -251,10 +250,8 @@ define i32 @test_srem_int_min(i32 %X) nounwind { ; CHECK-LABEL: test_srem_int_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #2147483647 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: add w8, w0, w8 -; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: asr w8, w0, #31 +; CHECK-NEXT: add w8, w0, w8, lsr #1 ; CHECK-NEXT: and w8, w8, #0x80000000 ; CHECK-NEXT: cmn w0, w8 ; CHECK-NEXT: cset w0, eq Index: llvm/test/CodeGen/AArch64/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -156,39 +156,33 @@ ; CHECK-LABEL: dont_fold_srem_power_of_two: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w9, v0.h[1] -; CHECK-NEXT: smov w10, v0.h[0] +; CHECK-NEXT: smov w9, v0.h[0] +; CHECK-NEXT: smov w10, v0.h[3] +; CHECK-NEXT: smov w11, v0.h[1] ; CHECK-NEXT: mov w8, #37253 ; CHECK-NEXT: movk w8, #44150, lsl #16 -; CHECK-NEXT: add w11, w9, #31 -; CHECK-NEXT: cmp w9, #0 -; CHECK-NEXT: add w12, w10, #63 -; CHECK-NEXT: csel w11, w11, w9, lt -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: and w11, w11, #0xffffffe0 -; CHECK-NEXT: csel w12, w12, w10, lt -; CHECK-NEXT: sub w9, w9, w11 +; CHECK-NEXT: add w12, w9, w9, lsr #26 +; CHECK-NEXT: smull x8, w10, w8 ; CHECK-NEXT: and w12, w12, #0xffffffc0 -; CHECK-NEXT: sub w10, w10, w12 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: smov w10, v0.h[2] -; CHECK-NEXT: smull x8, w12, w8 -; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: add w13, w11, w11, lsr #27 +; CHECK-NEXT: sub w9, w9, w12 +; CHECK-NEXT: and w12, w13, #0xffffffe0 +; CHECK-NEXT: smov w13, v0.h[2] ; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w9, w10, #7 -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: csel w9, w9, w10, lt -; CHECK-NEXT: add w8, w8, w12 +; CHECK-NEXT: sub w11, w11, w12 +; CHECK-NEXT: add w8, w8, w10 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: asr w9, w8, #6 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: add w9, w13, w13, lsr #29 +; CHECK-NEXT: mov v0.h[1], w11 ; CHECK-NEXT: and w9, w9, #0xfffffff8 -; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: asr w10, w8, #6 -; CHECK-NEXT: add w8, w10, w8, lsr #31 -; CHECK-NEXT: mov w10, #95 -; CHECK-NEXT: mov v1.h[2], w9 -; CHECK-NEXT: msub w8, w8, w10, w12 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: sub w9, w13, w9 +; CHECK-NEXT: mov w11, #95 +; CHECK-NEXT: msub w8, w8, w11, w10 +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -245,35 +239,32 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w8, v0.h[2] ; CHECK-NEXT: mov w9, #17097 -; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: mov w11, #32767 +; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: mov w11, #23 ; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: add w11, w10, w11 -; CHECK-NEXT: cmp w10, #0 ; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: csel w11, w11, w10, lt ; CHECK-NEXT: add w9, w9, w8 -; CHECK-NEXT: and w11, w11, #0xffff8000 ; CHECK-NEXT: asr w13, w9, #4 -; CHECK-NEXT: sub w10, w10, w11 -; CHECK-NEXT: mov w11, #47143 ; CHECK-NEXT: add w9, w13, w9, lsr #31 -; CHECK-NEXT: mov w13, #23 -; CHECK-NEXT: movk w11, #24749, lsl #16 -; CHECK-NEXT: mov v1.h[1], w10 -; CHECK-NEXT: msub w8, w9, w13, w8 -; CHECK-NEXT: smull x9, w12, w11 +; CHECK-NEXT: add w13, w10, w10, lsr #17 +; CHECK-NEXT: and w13, w13, #0xffff8000 +; CHECK-NEXT: sub w10, w10, w13 +; CHECK-NEXT: mov w13, #47143 +; CHECK-NEXT: movk w13, #24749, lsl #16 +; CHECK-NEXT: msub w8, w9, w11, w8 +; CHECK-NEXT: smull x9, w12, w13 +; CHECK-NEXT: mov v0.h[1], w10 ; CHECK-NEXT: lsr x10, x9, #63 ; CHECK-NEXT: asr x9, x9, #43 ; CHECK-NEXT: add w9, w9, w10 ; CHECK-NEXT: mov w10, #5423 -; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: msub w8, w9, w10, w12 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1