diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10963,6 +10963,28 @@
   return SDValue();
 }
 
+static bool isPow2Splat(SDValue Op, int64_t &SplatVal, bool &Negated) {
+  if (Op.getOpcode() != AArch64ISD::DUP &&
+      Op.getOpcode() != ISD::SPLAT_VECTOR)
+    return false;
+
+  if (!isa<ConstantSDNode>(Op->getOperand(0)))
+    return false;
+
+  SplatVal = Op->getConstantOperandVal(0);
+  if (Op.getValueType().getVectorElementType() != MVT::i64)
+    SplatVal = (int32_t)SplatVal;
+
+  Negated = false;
+  if (isPowerOf2_64(SplatVal))
+    return true;
+
+  Negated = true;
+  if (isPowerOf2_64(-SplatVal))
+    return true;
+
+  return false;
+}
+
 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -10977,6 +10999,11 @@
   if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
     return LowerToPredicatedOp(Op, DAG, PredOpcode);
 
+  bool Negated;
+  int64_t SplatVal;
+  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated))
+    return LowerToPredicatedOp(Op, DAG, PredOpcode);
+
   // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
   // operations, and truncate the result.
   EVT WidenedVT;
@@ -12961,13 +12988,42 @@
   if (isIntDivCheap(N->getValueType(0), Attr))
     return SDValue(N,0); // Lower SDIV as SDIV
 
-  // fold (sdiv X, pow2)
   EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // FIXME: We must handle NEON-sized vectors here, since the NEON mov (imm)
+  // expansion gets in the way. Ideally that expansion should be pushed into
+  // instruction selection using splat_vector, at which point this code can
+  // be removed.
+  if (Subtarget->hasSVE() && (VT.is128BitVector() || VT.is64BitVector())) {
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+    auto ConstType =
+        VT.getVectorElementType() == MVT::i64 ? MVT::i64 : MVT::i32;
+    auto Op = convertToScalableVector(DAG, ContainerVT, N->getOperand(0));
+    auto Splat = DAG.getSplatVector(
+        ContainerVT, DL,
+        DAG.getTargetConstant(Divisor.getSExtValue(), DL, ConstType));
+    auto SDiv = DAG.getNode(ISD::SDIV, DL, ContainerVT, Op, Splat);
+    auto Result = convertFromScalableVector(DAG, VT, SDiv);
+
+    Created.push_back(Op.getNode());
+    Created.push_back(Splat.getNode());
+    Created.push_back(SDiv.getNode());
+
+    return Result;
+  }
+
+  // For scalable vectors and fixed vectors wider than NEON, mark the divide
+  // as cheap so we can handle it much later. This allows us to handle larger
+  // than legal fixed types, as well as sdiv intrinsics.
+  if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
+    return SDValue(N, 0);
+
+  // fold (sdiv X, pow2)
   if ((VT != MVT::i32 && VT != MVT::i64) ||
       !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
     return SDValue();
 
-  SDLoc DL(N);
   SDValue N0 = N->getOperand(0);
   unsigned Lg2 = Divisor.countTrailingZeros();
   SDValue Zero = DAG.getConstant(0, DL, VT);
@@ -13519,6 +13575,40 @@
                      DAG.getConstant(C, DL, MVT::i32));
 }
 
+static SDValue performSDivPredCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  bool IsIntrinsic = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
+
+  SDValue Op = N->getOperand(IsIntrinsic ? 3 : 2);
+
+  bool Negated;
+  int64_t SplatVal;
+  if (!isPow2Splat(Op, SplatVal, Negated) || !Negated)
+    return SDValue();
+
+  MVT ConstType =
+      VT.getVectorElementType() == MVT::i64 ? MVT::i64 : MVT::i32;
+
+  SDValue Pow2 = DAG.getConstant(-SplatVal, DL, ConstType);
+  SDValue Splat = DAG.getSplatVector(VT, DL, Pow2);
+  SDValue SDiv =
+      IsIntrinsic
+          ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, N->getOperand(0),
+                        N->getOperand(1), N->getOperand(2), Splat)
+          : DAG.getNode(AArch64ISD::SDIV_PRED, DL, VT, N->getOperand(0),
+                        N->getOperand(1), Splat);
+  SDValue ZeroSplat =
+      DAG.getSplatVector(VT, DL, DAG.getConstant(0, DL, ConstType));
+
+  if (IsIntrinsic)
+    return DAG.getNode(AArch64ISD::SUB_PRED, DL, VT, N->getOperand(1),
+                       ZeroSplat, SDiv);
+
+  // SDIV_PRED treats inactive lanes as undef, hence it is safe to ignore the
+  // predicate here.
+  return DAG.getNode(ISD::SUB, DL, VT, ZeroSplat, SDiv);
+}
+
 /// An EXTR instruction is made up of two shifts, ORed together. This helper
 /// searches for and classifies those shifts.
 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
@@ -15174,6 +15264,8 @@
   case Intrinsic::aarch64_sve_ptest_last:
     return getPTest(DAG, N->getValueType(0), N->getOperand(1),
                     N->getOperand(2), AArch64CC::LAST_ACTIVE);
+  case Intrinsic::aarch64_sve_sdiv:
+    return performSDivPredCombine(N, DAG);
   }
   return SDValue();
 }
@@ -17303,6 +17395,8 @@
     return performUzpCombine(N, DAG);
   case AArch64ISD::SETCC_MERGE_ZERO:
     return performSetccMergeZeroCombine(N, DAG);
+  case AArch64ISD::SDIV_PRED:
+    return performSDivPredCombine(N, DAG);
   case AArch64ISD::GLD1_MERGE_ZERO:
   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
@@ -18710,6 +18804,11 @@
   if (EltVT == MVT::i32 || EltVT == MVT::i64)
     return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
 
+  bool Negated;
+  int64_t SplatVal;
+  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated))
+    return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
+
   // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
   EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1575,11 +1575,12 @@
   defm ASR_ZPmI  : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr",  "ASR_ZPZI",  int_aarch64_sve_asr>;
   defm LSR_ZPmI  : sve_int_bin_pred_shift_imm_right_dup<0b0001, "lsr",  "LSR_ZPZI",  int_aarch64_sve_lsr>;
   defm LSL_ZPmI  : sve_int_bin_pred_shift_imm_left_dup< 0b0011, "lsl",  "LSL_ZPZI",  int_aarch64_sve_lsl>;
-  defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<    0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>;
+  defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<    0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd, int_aarch64_sve_sdiv>;
 
-  defm ASR_ZPZI : sve_int_shift_pred_bhsd;
-  defm LSR_ZPZI : sve_int_shift_pred_bhsd;
-  defm LSL_ZPZI : sve_int_shift_pred_bhsd;
+  defm ASR_ZPZI  : sve_int_shift_pred_bhsd;
+  defm LSR_ZPZI  : sve_int_shift_pred_bhsd;
+  defm LSL_ZPZI  : sve_int_shift_pred_bhsd;
+  defm ASRD_ZPZI : sve_int_shift_imm_pred_bhsd<AArch64sdiv_p>;
 } // End HasSVEorStreamingSVE
 
 let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -155,6 +155,25 @@
   let ParserMatchClass = SVELogicalImmNotOperand<32>;
 }
 
+def IsPow2_32 : PatLeaf<(i32 imm), [{
+  uint32_t V = N->getZExtValue();
+  return isPowerOf2_32(V);
+}]>;
+
+def Log2_32 : SDNodeXForm<imm, [{
+  uint32_t V = N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_32(V), SDLoc(N), MVT::i32);
+}]>;
+
+def IsPow2_64 : PatLeaf<(i64 imm), [{
+  uint64_t V = N->getZExtValue();
+  return isPowerOf2_64(V);
+}]>;
+
+def Log2_64 : SDNodeXForm<imm, [{
+  uint64_t V = N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_64(V), SDLoc(N), MVT::i32);
+}]>;
+
 class SVEShiftedImmOperand<int ElementWidth, string Infix, string Predicate>
     : AsmOperandClass {
   let Name = "SVE" # Infix # "Imm" # ElementWidth;
@@ -490,6 +509,13 @@
   : Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
         (inst $Rn, i32:$imm)>;
 
+class SVE_Shift_DupImm_Div_Pat<ValueType vt, SDPatternOperator op,
+                               ValueType pt, ValueType it, PatLeaf pow,
+                               SDNodeXForm log, Instruction inst>
+  : Pat<(vt (op pt:$Pg, vt:$Op, (vt (AArch64dup (it (pow:$V)))))),
+        (inst $Pg, $Op, (log $V))>;
+
 class SVE_2_Op_Fp_Imm_Pat
 
 multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm, string Ps,
-                                            SDPatternOperator op = null_frag> {
+                                            SDPatternOperator op = null_frag,
+                                            SDPatternOperator divOp = null_frag> {
   def _B : SVEPseudo2Instr<Ps # _B, 1>,
            sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
   def _H : SVEPseudo2Instr<Ps # _H, 1>,
@@ -5365,12 +5392,17 @@
   def : SVE_3_Op_Imm_Pat(NAME # _H)>;
   def : SVE_3_Op_Imm_Pat(NAME # _S)>;
   def : SVE_3_Op_Imm_Pat(NAME # _D)>;
+
+  def : SVE_Shift_DupImm_Div_Pat<nxv16i8, divOp, nxv16i1, i32, IsPow2_32, Log2_32, !cast<Instruction>(NAME # _B)>;
+  def : SVE_Shift_DupImm_Div_Pat<nxv8i16, divOp, nxv8i1,  i32, IsPow2_32, Log2_32, !cast<Instruction>(NAME # _H)>;
+  def : SVE_Shift_DupImm_Div_Pat<nxv4i32, divOp, nxv4i1,  i32, IsPow2_32, Log2_32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_Shift_DupImm_Div_Pat<nxv2i64, divOp, nxv2i1,  i64, IsPow2_64, Log2_64, !cast<Instruction>(NAME # _D)>;
 }
 
 // As above but shift amount takes the form of a "vector immediate".
 multiclass sve_int_bin_pred_shift_imm_right_dup<bits<4> opc, string asm,
                                                 string Ps, SDPatternOperator op>
-: sve_int_bin_pred_shift_imm_right {
+: sve_int_bin_pred_shift_imm_right {
   def : SVE_Shift_DupImm_Pred_Pat(NAME # _B)>;
   def : SVE_Shift_DupImm_Pred_Pat(NAME # _H)>;
   def : SVE_Shift_DupImm_Pred_Pat(NAME # _S)>;
@@ -8432,6 +8464,18 @@
   def : SVE_Shift_DupImm_Pred_Pat(NAME # _UNDEF_D)>;
 }
 
+multiclass sve_int_shift_imm_pred_bhsd<SDPatternOperator op> {
+  def _UNDEF_B : PredTwoOpImmPseudo<NAME # _B, ZPR8,  vecshiftR8,  FalseLanesUndef>;
+  def _UNDEF_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, vecshiftR16, FalseLanesUndef>;
+  def _UNDEF_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, vecshiftR32, FalseLanesUndef>;
+  def _UNDEF_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, vecshiftR64, FalseLanesUndef>;
+
+  def : SVE_Shift_DupImm_Div_Pat<nxv16i8, op, nxv16i1, i32, IsPow2_32, Log2_32, !cast<Instruction>(NAME # _UNDEF_B)>;
+  def : SVE_Shift_DupImm_Div_Pat<nxv8i16, op, nxv8i1,  i32, IsPow2_32, Log2_32, !cast<Instruction>(NAME # _UNDEF_H)>;
+  def : SVE_Shift_DupImm_Div_Pat<nxv4i32, op, nxv4i1,  i32, IsPow2_32, Log2_32, !cast<Instruction>(NAME # _UNDEF_S)>;
+  def : SVE_Shift_DupImm_Div_Pat<nxv2i64, op, nxv2i1,  i64, IsPow2_64, Log2_64, !cast<Instruction>(NAME # _UNDEF_D)>;
+}
+
 multiclass sve_int_bin_pred_all_active_bhsd {
   def _UNDEF_B : PredTwoOpPseudo;
   def _UNDEF_H : PredTwoOpPseudo;
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
@@ -0,0 +1,389 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 {
+; CHECK-LABEL: sdiv_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer)
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 {
+; CHECK-LABEL: sdiv_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer)
+  ret <16 x i8> %res
+}
+
+define void @sdiv_v32i8(<32 x i8>* %a) #0 {
+; CHECK-LABEL: sdiv_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <32 x i8>, <32 x i8>* %a
+  %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer)
+  store <32 x i8> %res, <32 x i8>* %a
+  ret void
+}
+
+define void @sdiv_v64i8(<64 x i8>* %a) #0 {
+; VBITS_EQ_256-LABEL: sdiv_v64i8:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov w8, #32
+; VBITS_EQ_256-NEXT:    ptrue p0.b, vl32
+; VBITS_EQ_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_EQ_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    asrd z0.b, p0/m, z0.b, #5
+; VBITS_EQ_256-NEXT:    asrd z1.b, p0/m, z1.b, #5
+; VBITS_EQ_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_EQ_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    asrd z0.b, p0/m, z0.b, #5
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <64 x i8>, <64 x i8>* %a
+  %res = sdiv <64 x i8> %op1, shufflevector (<64 x i8> insertelement (<64 x i8> poison, i8 32, i32 0), <64 x i8> poison, <64 x i32> zeroinitializer)
+  store <64 x i8> %res, <64 x i8>* %a
+  ret void
+}
+
+define void @sdiv_v128i8(<128 x i8>* %a) #0 {
+; VBITS_GE_1024-LABEL: sdiv_v128i8:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
+; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    asrd z0.b, p0/m, z0.b, #5
+; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
+  %op1 = load <128 x i8>, <128 x i8>* %a
+  %res = sdiv <128 x i8> %op1, shufflevector (<128 x i8> insertelement (<128 x i8> poison, i8 32, i32 0), <128 x i8> poison, <128 x i32> zeroinitializer)
+  store <128 x i8> %res, <128 x i8>* %a
+  ret void
+}
+
+define void @sdiv_v256i8(<256 x i8>* %a) #0 {
+; VBITS_GE_2048-LABEL: sdiv_v256i8:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.b, vl256
+; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_2048-NEXT:    asrd z0.b, p0/m, z0.b, #5
+; VBITS_GE_2048-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_2048-NEXT:    ret
+  %op1 = load <256 x i8>, <256 x i8>* %a
+  %res = sdiv <256 x i8> %op1, shufflevector (<256 x i8> insertelement (<256 x i8> poison, i8 32, i32 0), <256 x i8> poison, <256 x i32> zeroinitializer)
+  store <256 x i8> %res, <256 x i8>* %a
+  ret void
+}
+
+define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 {
+; CHECK-LABEL: sdiv_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer)
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 {
+; CHECK-LABEL: sdiv_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer)
+  ret <8 x i16> %res
+}
+
+define void @sdiv_v16i16(<16 x i16>* %a) #0 {
+; CHECK-LABEL: sdiv_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer)
+  store <16 x i16> %res, <16 x i16>* %a
+  ret void
+}
+
+define void @sdiv_v32i16(<32 x i16>* %a) #0 {
+; VBITS_EQ_256-LABEL: sdiv_v32i16:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #16
+; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    asrd z0.h, p0/m, z0.h, #5
+; VBITS_EQ_256-NEXT:    asrd z1.h, p0/m, z1.h, #5
+; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    asrd z0.h, p0/m, z0.h, #5
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <32 x i16>, <32 x i16>* %a
+  %res = sdiv <32 x i16> %op1, shufflevector (<32 x i16> insertelement (<32 x i16> poison, i16 32, i32 0), <32 x i16> poison, <32 x i32> zeroinitializer)
+  store <32 x i16> %res, <32 x i16>* %a
+  ret void
+}
+
+define void @sdiv_v64i16(<64 x i16>* %a) #0 {
+; VBITS_GE_1024-LABEL: sdiv_v64i16:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    asrd z0.h, p0/m, z0.h, #5
+; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
+  %op1 = load <64 x i16>, <64 x i16>* %a
+  %res = sdiv <64 x i16> %op1, shufflevector (<64 x i16> insertelement (<64 x i16> poison, i16 32, i32 0), <64 x i16> poison, <64 x i32> zeroinitializer)
+  store <64 x i16> %res, <64 x i16>* %a
+  ret void
+}
+
+define void @sdiv_v128i16(<128 x i16>* %a) #0 {
+; VBITS_GE_2048-LABEL: sdiv_v128i16:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
+; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_2048-NEXT:    asrd z0.h, p0/m, z0.h, #5
+; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_2048-NEXT:    ret
+  %op1 = load <128 x i16>, <128 x i16>* %a
+  %res = sdiv <128 x i16> %op1, shufflevector (<128 x i16> insertelement (<128 x i16> poison, i16 32, i32 0), <128 x i16> poison, <128 x i32> zeroinitializer)
+  store <128 x i16> %res, <128 x i16>* %a
+  ret void
+}
+
+define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 {
+; CHECK-LABEL: sdiv_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer)
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 {
+; CHECK-LABEL: sdiv_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer)
+  ret <4 x i32> %res
+}
+
+define void @sdiv_v8i32(<8 x i32>* %a) #0 {
+; CHECK-LABEL: sdiv_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer)
+  store <8 x i32> %res, <8 x i32>* %a
+  ret void
+}
+
+define void @sdiv_v16i32(<16 x i32>* %a) #0 {
+; VBITS_EQ_256-LABEL: sdiv_v16i32:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #8
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    asrd z0.s, p0/m, z0.s, #5
+; VBITS_EQ_256-NEXT:    asrd z1.s, p0/m, z1.s, #5
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    asrd z0.s, p0/m, z0.s, #5
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %res = sdiv <16 x i32> %op1, shufflevector (<16 x i32> insertelement (<16 x i32> poison, i32 32, i32 0), <16 x i32> poison, <16 x i32> zeroinitializer)
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+define void @sdiv_v32i32(<32 x i32>* %a) #0 {
+; VBITS_GE_1024-LABEL: sdiv_v32i32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    asrd z0.s, p0/m, z0.s, #5
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
+  %op1 = load <32 x i32>, <32 x i32>* %a
+  %res = sdiv <32 x i32> %op1, shufflevector (<32 x i32> insertelement (<32 x i32> poison, i32 32, i32 0), <32 x i32> poison, <32 x i32> zeroinitializer)
+  store <32 x i32> %res, <32 x i32>* %a
+  ret void
+}
+
+define void @sdiv_v64i32(<64 x i32>* %a) #0 {
+; VBITS_GE_2048-LABEL: sdiv_v64i32:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_2048-NEXT:    asrd z0.s, p0/m, z0.s, #5
+; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_2048-NEXT:    ret
+  %op1 = load <64 x i32>, <64 x i32>* %a
+  %res = sdiv <64 x i32> %op1, shufflevector (<64 x i32> insertelement (<64 x i32> poison, i32 32, i32 0), <64 x i32> poison, <64 x i32> zeroinitializer)
+  store <64 x i32> %res, <64 x i32>* %a
+  ret void
+}
+
+define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 {
+; CHECK-LABEL: sdiv_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer)
+  ret <1 x i64> %res
+}
+
+; Vector i64 sdiv is not legal for NEON, so use SVE when available.
+define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 {
+; CHECK-LABEL: sdiv_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer)
+  ret <2 x i64> %res
+}
+
+define void @sdiv_v4i64(<4 x i64>* %a) #0 {
+; CHECK-LABEL: sdiv_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer)
+  store <4 x i64> %res, <4 x i64>* %a
+  ret void
+}
+
+define void @sdiv_v8i64(<8 x i64>* %a) #0 {
+; VBITS_EQ_256-LABEL: sdiv_v8i64:
+; VBITS_EQ_256:       // %bb.0:
+; VBITS_EQ_256-NEXT:    mov x8, #4
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; VBITS_EQ_256-NEXT:    asrd z1.d, p0/m, z1.d, #5
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_EQ_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sdiv_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <8 x i64>, <8 x i64>* %a
+  %res = sdiv <8 x i64> %op1, shufflevector (<8 x i64> insertelement (<8 x i64> poison, i64 32, i32 0), <8 x i64> poison, <8 x i32> zeroinitializer)
+  store <8 x i64> %res, <8 x i64>* %a
+  ret void
+}
+
+define void @sdiv_v16i64(<16 x i64>* %a) #0 {
+; VBITS_GE_1024-LABEL: sdiv_v16i64:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
+  %op1 = load <16 x i64>, <16 x i64>* %a
+  %res = sdiv <16 x i64> %op1, shufflevector (<16 x i64> insertelement (<16 x i64> poison, i64 32, i32 0), <16 x i64> poison, <16 x i32> zeroinitializer)
+  store <16 x i64> %res, <16 x i64>* %a
+  ret void
+}
+
+define void @sdiv_v32i64(<32 x i64>* %a) #0 {
+; VBITS_GE_2048-LABEL: sdiv_v32i64:
+; VBITS_GE_2048:       // %bb.0:
+; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_2048-NEXT:    asrd z0.d, p0/m, z0.d, #5
+; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_2048-NEXT:    ret
+  %op1 = load <32 x i64>, <32 x i64>* %a
+  %res = sdiv <32 x i64> %op1, shufflevector (<32 x i64> insertelement (<32 x i64> poison, i64 32, i32 0), <32 x i64> poison, <32 x i32> zeroinitializer)
+  store <32 x i64> %res, <32 x i64>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-sdiv-pow2.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-sdiv-pow2.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: sdiv_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #23
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 8388608, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @sdiv_i32_neg(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: sdiv_i32_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #25
+; CHECK-NEXT:    subr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -33554432, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: sdiv_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #53
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sdiv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 9007199254740992, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @sdiv_i64_neg(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: sdiv_i64_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #55
+; CHECK-NEXT:    subr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sdiv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -36028797018963968, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+  ret <vscale x 2 x i64> %out
+}
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sdiv.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: sdiv_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #4
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 16 x i8> %a, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 16, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 16 x i8> @sdiv_i8_neg(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: sdiv_i8_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #6
+; CHECK-NEXT:    subr z0.b, z0.b, #0 // =0x0
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 16 x i8> %a, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -64, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: sdiv_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #10
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1024, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 8 x i16> @sdiv_i16_neg(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: sdiv_i16_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #12
+; CHECK-NEXT:    subr z0.h, z0.h, #0 // =0x0
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 -4096, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: sdiv_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #23
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 8388608, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @sdiv_i32_neg(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: sdiv_i32_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #25
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -33554432, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: sdiv_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #53
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 2 x i64> %a, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 9007199254740992, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @sdiv_i64_neg(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: sdiv_i64_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #55
+; CHECK-NEXT:    subr z0.d, z0.d, #0 // =0x0
+; CHECK-NEXT:    ret
+  %out = sdiv <vscale x 2 x i64> %a, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -36028797018963968, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x i64> %out
+}
+
+attributes #0 = { "target-features"="+sve" }
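
A note on why a single ASRD covers the positive cases above: sdiv truncates toward zero, while a plain arithmetic shift rounds toward negative infinity, so ASRD biases negative inputs by 2^k - 1 before shifting. The C++ sketch below is a scalar model of the per-lane computation, for illustration only; it is not part of the patch, and the function name asrd_model is made up.

#include <cassert>
#include <cstdint>

// Scalar model of SVE ASRD: arithmetic shift right with round toward zero.
// Assumes k < 31; >> on a negative left operand is arithmetic (guaranteed
// since C++20, and on all AArch64 ABIs in practice).
int32_t asrd_model(int32_t x, unsigned k) {
  int32_t bias = x < 0 ? (int32_t(1) << k) - 1 : 0; // bias negatives by 2^k-1
  return (x + bias) >> k;
}

int main() {
  assert(asrd_model(20, 2) == 5);   // 20 / 4
  assert(asrd_model(-20, 2) == -5); // -20 / 4
  assert(asrd_model(-21, 2) == -5); // truncates toward zero, not to -6
  // Negated powers of two reuse the same shift followed by a subtract from
  // zero, which is what performSDivPredCombine emits (the asrd + subr pairs
  // in the CHECK lines above):
  assert(0 - asrd_model(-21, 2) == 5); // -21 / -4
  return 0;
}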