diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -96,6 +96,7 @@ SMIN_PRED, SRA_PRED, SRL_PRED, + SRAD_PRED, SUB_PRED, UDIV_PRED, UMAX_PRED, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1941,6 +1941,7 @@ MAKE_CASE(AArch64ISD::SMIN_PRED) MAKE_CASE(AArch64ISD::SRA_PRED) MAKE_CASE(AArch64ISD::SRL_PRED) + MAKE_CASE(AArch64ISD::SRAD_PRED) MAKE_CASE(AArch64ISD::SUB_PRED) MAKE_CASE(AArch64ISD::UDIV_PRED) MAKE_CASE(AArch64ISD::UMAX_PRED) @@ -2305,6 +2306,10 @@ static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG); +static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT); +static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT); /// isZerosVector - Check whether SDNode N is a zero-filled vector. 
static bool isZerosVector(const SDNode *N) { @@ -12930,31 +12935,61 @@ if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV - // fold (sdiv X, pow2) EVT VT = N->getValueType(0); - if ((VT != MVT::i32 && VT != MVT::i64) || + bool IsFoldableVector = + Subtarget->hasSVE() && + (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, true)); + bool IsFoldableScalar = VT == MVT::i32 || VT == MVT::i64; + + // fold (sdiv X, pow2) + if ((!IsFoldableVector && !IsFoldableScalar) || !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); unsigned Lg2 = Divisor.countTrailingZeros(); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); - // Add (N0 < 0) ? Pow2 - 1 : 0; - SDValue CCVal; - SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); - SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); + SDValue SRA; + + if (IsFoldableVector && VT.isFixedLengthVector()) { + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + auto Op = convertToScalableVector(DAG, ContainerVT, N0); + + auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); + auto ScalableSRA = + DAG.getNode(AArch64ISD::SRAD_PRED, DL, ContainerVT, Pg, Op, + DAG.getTargetConstant(Lg2, DL, MVT::i32)); - Created.push_back(Cmp.getNode()); - Created.push_back(Add.getNode()); - Created.push_back(CSel.getNode()); + SRA = convertFromScalableVector(DAG, VT, ScalableSRA); - // Divide by pow2. 
- SDValue SRA = - DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); + Created.push_back(Op.getNode()); + Created.push_back(Pg.getNode()); + Created.push_back(ScalableSRA.getNode()); + } else if (IsFoldableVector) { + SDValue Pg = getPredicateForScalableVector(DAG, DL, VT); + SRA = DAG.getNode(AArch64ISD::SRAD_PRED, DL, VT, Pg, N0, + DAG.getTargetConstant(Lg2, DL, MVT::i32)); + + Created.push_back(Pg.getNode()); + } else { + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); + + // Add (N0 < 0) ? Pow2 - 1 : 0; + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); + SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); + + Created.push_back(Cmp.getNode()); + Created.push_back(Add.getNode()); + Created.push_back(CSel.getNode()); + + // Divide by pow2. + SRA = + DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); + } // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. 
@@ -14994,6 +15029,9 @@
     return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
   case Intrinsic::aarch64_sve_asr:
     return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
+  case Intrinsic::aarch64_sve_asrd:
+    return DAG.getNode(AArch64ISD::SRAD_PRED, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
   case Intrinsic::aarch64_sve_fadd:
     return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
   case Intrinsic::aarch64_sve_fsub:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -275,6 +275,13 @@
 def AArch64bic : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>;
 
+def SDT_AArch64Arith_Imm : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3,i32>,
+  SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>
+]>;
+
+def AArch64asrd_p : SDNode<"AArch64ISD::SRAD_PRED", SDT_AArch64Arith_Imm>;
+
 let Predicates = [HasSVE] in {
   defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
   def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
@@ -1575,7 +1582,7 @@
   defm ASR_ZPmI  : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr",  "ASR_ZPZI",  int_aarch64_sve_asr>;
   defm LSR_ZPmI  : sve_int_bin_pred_shift_imm_right_dup<0b0001, "lsr",  "LSR_ZPZI",  int_aarch64_sve_lsr>;
   defm LSL_ZPmI  : sve_int_bin_pred_shift_imm_left_dup< 0b0011, "lsl",  "LSL_ZPZI",  int_aarch64_sve_lsl>;
-  defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<    0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>;
+  defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<    0b0100, "asrd", "ASRD_ZPZI", AArch64asrd_p>;
 
   defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
   defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
@@ -1586,7 +1593,7 @@
   defm ASR_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>;
   defm LSR_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>;
   defm LSL_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>;
-  defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
+  defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<AArch64asrd_p>;
 } // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos
 
 let Predicates = [HasSVEorStreamingSVE] in {
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
@@ -0,0 +1,340 @@
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc 
-aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048 + +target triple = "aarch64-unknown-linux-gnu" + +define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 { +; CHECK-LABEL: sdiv_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer) + ret <8 x i8> %res +} + +define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 { +; CHECK-LABEL: sdiv_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer) + ret <16 x i8> %res +} + +define void @sdiv_v32i8(<32 x i8>* %a) #0 { +; CHECK-LABEL: sdiv_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer) + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @sdiv_v64i8(<64 x i8>* %a) #0 { +; VBITS_GE_512-LABEL: sdiv_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %res = sdiv <64 x i8> %op1, 
shufflevector (<64 x i8> insertelement (<64 x i8> poison, i8 32, i32 0), <64 x i8> poison, <64 x i32> zeroinitializer) + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define void @sdiv_v128i8(<128 x i8>* %a) #0 { +; VBITS_GE_1024-LABEL: sdiv_v128i8: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 +; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_1024-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <128 x i8>, <128 x i8>* %a + %res = sdiv <128 x i8> %op1, shufflevector (<128 x i8> insertelement (<128 x i8> poison, i8 32, i32 0), <128 x i8> poison, <128 x i32> zeroinitializer) + store <128 x i8> %res, <128 x i8>* %a + ret void +} + +define void @sdiv_v256i8(<256 x i8>* %a) #0 { +; VBITS_GE_2048-LABEL: sdiv_v256i8: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 +; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_2048-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <256 x i8>, <256 x i8>* %a + %res = sdiv <256 x i8> %op1, shufflevector (<256 x i8> insertelement (<256 x i8> poison, i8 32, i32 0), <256 x i8> poison, <256 x i32> zeroinitializer) + store <256 x i8> %res, <256 x i8>* %a + ret void +} + +define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 { +; CHECK-LABEL: sdiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer) + ret <4 x i16> %res +} + +define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 { +; CHECK-LABEL: sdiv_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: asrd 
z0.h, p0/m, z0.h, #5 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer) + ret <8 x i16> %res +} + +define void @sdiv_v16i16(<16 x i16>* %a) #0 { +; CHECK-LABEL: sdiv_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer) + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @sdiv_v32i16(<32 x i16>* %a) #0 { +; VBITS_GE_512-LABEL: sdiv_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %res = sdiv <32 x i16> %op1, shufflevector (<32 x i16> insertelement (<32 x i16> poison, i16 32, i32 0), <32 x i16> poison, <32 x i32> zeroinitializer) + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define void @sdiv_v64i16(<64 x i16>* %a) #0 { +; VBITS_GE_1024-LABEL: sdiv_v64i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <64 x i16>, <64 x i16>* %a + %res = sdiv <64 x i16> %op1, shufflevector (<64 x i16> insertelement (<64 x i16> poison, i16 32, i32 0), <64 x i16> poison, <64 x i32> zeroinitializer) + store <64 x i16> %res, <64 x i16>* %a + ret void +} + +define void @sdiv_v128i16(<128 x i16>* %a) #0 { +; 
VBITS_GE_2048-LABEL: sdiv_v128i16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_2048-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <128 x i16>, <128 x i16>* %a + %res = sdiv <128 x i16> %op1, shufflevector (<128 x i16> insertelement (<128 x i16> poison, i16 32, i32 0), <128 x i16> poison, <128 x i32> zeroinitializer) + store <128 x i16> %res, <128 x i16>* %a + ret void +} + +define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 { +; CHECK-LABEL: sdiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer) + ret <2 x i32> %res +} + +define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 { +; CHECK-LABEL: sdiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer) + ret <4 x i32> %res +} + +define void @sdiv_v8i32(<8 x i32>* %a) #0 { +; CHECK-LABEL: sdiv_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 +; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer) + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void 
@sdiv_v16i32(<16 x i32>* %a) #0 { +; VBITS_GE_512-LABEL: sdiv_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %res = sdiv <16 x i32> %op1, shufflevector (<16 x i32> insertelement (<16 x i32> poison, i32 32, i32 0), <16 x i32> poison, <16 x i32> zeroinitializer) + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define void @sdiv_v32i32(<32 x i32>* %a) #0 { +; VBITS_GE_1024-LABEL: sdiv_v32i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <32 x i32>, <32 x i32>* %a + %res = sdiv <32 x i32> %op1, shufflevector (<32 x i32> insertelement (<32 x i32> poison, i32 32, i32 0), <32 x i32> poison, <32 x i32> zeroinitializer) + store <32 x i32> %res, <32 x i32>* %a + ret void +} + +define void @sdiv_v64i32(<64 x i32>* %a) #0 { +; VBITS_GE_2048-LABEL: sdiv_v64i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <64 x i32>, <64 x i32>* %a + %res = sdiv <64 x i32> %op1, shufflevector (<64 x i32> insertelement (<64 x i32> poison, i32 32, i32 0), <64 x i32> poison, <64 x i32> zeroinitializer) + store <64 x i32> %res, <64 x i32>* %a + ret void +} + +define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 { +; CHECK-LABEL: sdiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret 
+ %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer) + ret <1 x i64> %res +} + +; Vector i64 sdiv are not legal for NEON so use SVE when available. +define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 { +; CHECK-LABEL: sdiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer) + ret <2 x i64> %res +} + +define void @sdiv_v4i64(<4 x i64>* %a) #0 { +; CHECK-LABEL: sdiv_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer) + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @sdiv_v8i64(<8 x i64>* %a) #0 { +; VBITS_GE_512-LABEL: sdiv_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %res = sdiv <8 x i64> %op1, shufflevector (<8 x i64> insertelement (<8 x i64> poison, i64 32, i32 0), <8 x i64> poison, <8 x i32> zeroinitializer) + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @sdiv_v16i64(<16 x i64>* %a) #0 { +; VBITS_GE_1024-LABEL: sdiv_v16i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: 
asrd z0.d, p0/m, z0.d, #5
+; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
+  %op1 = load <16 x i64>, <16 x i64>* %a
+  %res = sdiv <16 x i64> %op1, shufflevector (<16 x i64> insertelement (<16 x i64> poison, i64 32, i32 0), <16 x i64> poison, <16 x i32> zeroinitializer)
+  store <16 x i64> %res, <16 x i64>* %a
+  ret void
+}
+
+define void @sdiv_v32i64(<32 x i64>* %a) #0 {
+; VBITS_GE_2048-LABEL: sdiv_v32i64:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_2048-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_2048-NEXT:    ret
+  %op1 = load <32 x i64>, <32 x i64>* %a
+  %res = sdiv <32 x i64> %op1, shufflevector (<32 x i64> insertelement (<32 x i64> poison, i64 32, i32 0), <32 x i64> poison, <32 x i32> zeroinitializer)
+  store <32 x i64> %res, <32 x i64>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: sdiv_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #4
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 16 x i8> %a, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 16, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 16 x i8> @sdiv_i8_neg(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: sdiv_i8_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #6
+; CHECK-NEXT:    subr z0.b, z0.b, #0 // =0x0
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 16 x i8> %a, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -64, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: sdiv_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #10
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1024, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 8 x i16> @sdiv_i16_neg(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: sdiv_i16_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #12
+; CHECK-NEXT:    subr z0.h, z0.h, #0 // =0x0
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 -4096, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: sdiv_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #23
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 8388608, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @sdiv_i32_neg(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: sdiv_i32_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #25
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -33554432, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: sdiv_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #53
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 2 x i64> %a, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 9007199254740992, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @sdiv_i64_neg(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: sdiv_i64_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #55
+; CHECK-NEXT:    subr z0.d, z0.d, #0 // =0x0
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 2 x i64> %a, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -36028797018963968, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x i64> %out
+}
+
+attributes #0 = { "target-features"="+sve" }