Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -486,6 +486,12 @@
 const unsigned RoundingBitsPos = 22;
 } // namespace AArch64

+enum class NeonOverride {
+  Never,
+  OnlyIfSVEGreaterThan128Bits,
+  Always,
+};
+
 class AArch64Subtarget;

 class AArch64TargetLowering : public TargetLowering {
@@ -971,8 +977,8 @@
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
-                              bool OverrideNEON = false) const;
+  SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
+                              unsigned NewOp) const;
   SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -1123,7 +1129,9 @@
   // Normally SVE is only used for byte size vectors that do not fit within a
   // NEON vector. This changes when OverrideNEON is true, allowing SVE to be
   // used for 64bit and 128bit vectors as well.
-  bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
+  bool
+  useSVEForFixedLengthVectorVT(EVT VT,
+                               NeonOverride NO = NeonOverride::Never) const;

   // With the exception of data-predicate transitions, no instructions are
   // required to cast between legal scalable vector types. However:
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1336,6 +1336,13 @@
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);

+    // NEON doesn't support integer divides, but SVE does
+    for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+                    MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+      setOperationAction(ISD::SDIV, VT, Custom);
+      setOperationAction(ISD::UDIV, VT, Custom);
+    }
+
     // NOTE: Currently this has to happen after computeRegisterProperties rather
     // than the preferred option of combining it with the addRegisterClass call.
     if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1368,26 +1375,10 @@
       setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
       setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
       setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
-      setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
-      setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
-      setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
-      setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
-      setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
-      setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
-      setOperationAction(ISD::SDIV, MVT::v1i64, Custom);
-      setOperationAction(ISD::SDIV, MVT::v2i64, Custom);
       setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
       setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
       setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
       setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
-      setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
-      setOperationAction(ISD::UDIV, MVT::v16i8, Custom);
-      setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
-      setOperationAction(ISD::UDIV, MVT::v8i16, Custom);
-      setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
-      setOperationAction(ISD::UDIV, MVT::v4i32, Custom);
-      setOperationAction(ISD::UDIV, MVT::v1i64, Custom);
-      setOperationAction(ISD::UDIV, MVT::v2i64, Custom);
       setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
       setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
       setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
       setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
@@ -3921,8 +3912,11 @@
   // If SVE is available then i64 vector multiplications can also be made legal.
   bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;

-  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
+  if (VT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(
+          VT, OverrideNEON ? NeonOverride::OnlyIfSVEGreaterThan128Bits
+                           : NeonOverride::Never))
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);

   // Multiplications are only custom-lowered for 128-bit vectors so that
   // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
@@ -4364,8 +4358,8 @@

 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   return ExtVal.getValueType().isScalableVector() ||
-         useSVEForFixedLengthVectorVT(ExtVal.getValueType(),
-                                      /*OverrideNEON=*/true);
+         useSVEForFixedLengthVectorVT(
+             ExtVal.getValueType(), NeonOverride::OnlyIfSVEGreaterThan128Bits);
 }

 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
@@ -4705,7 +4699,8 @@
   assert(LoadNode && "Expected custom lowering of a masked load node");
   EVT VT = Op->getValueType(0);

-  if (useSVEForFixedLengthVectorVT(VT, true))
+  if (useSVEForFixedLengthVectorVT(VT,
+                                   NeonOverride::OnlyIfSVEGreaterThan128Bits))
     return LowerFixedLengthVectorMLoadToSVE(Op, DAG);

   SDValue PassThru = LoadNode->getPassThru();
@@ -4772,7 +4767,8 @@
   EVT MemVT = StoreNode->getMemoryVT();

   if (VT.isVector()) {
-    if (useSVEForFixedLengthVectorVT(VT, true))
+    if (useSVEForFixedLengthVectorVT(VT,
+                                     NeonOverride::OnlyIfSVEGreaterThan128Bits))
       return LowerFixedLengthVectorStoreToSVE(Op, DAG);

     unsigned AS = StoreNode->getAddressSpace();
@@ -5090,11 +5086,9 @@
   case ISD::MUL:
     return LowerMUL(Op, DAG);
   case ISD::MULHS:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
   case ISD::MULHU:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::ATOMIC_STORE:
@@ -5179,8 +5173,7 @@
   case ISD::BSWAP:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
   case ISD::CTLZ:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
   case ISD::CTTZ:
     return LowerCTTZ(Op, DAG);
   case ISD::VECTOR_SPLICE:
@@ -5193,10 +5186,7 @@
 }

 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
-    EVT VT, bool OverrideNEON) const {
-  if (!Subtarget->useSVEForFixedLengthVectors())
-    return false;
-
+    EVT VT, NeonOverride NO) const {
   if (!VT.isFixedLengthVector())
     return false;

@@ -5218,7 +5208,15 @@
   }

   // All SVE implementations support NEON sized vectors.
-  if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
+  bool isNeonSizedVector = VT.is128BitVector() || VT.is64BitVector();
+  if (isNeonSizedVector && NO == NeonOverride::Always && Subtarget->hasSVE())
+    return true;
+
+  if (!Subtarget->useSVEForFixedLengthVectors())
+    return false;
+
+  // All SVE implementations support NEON sized vectors.
+  if (isNeonSizedVector && NO != NeonOverride::Never)
     return true;

   // Ensure NEON MVTs only belong to a single register class.
@@ -7398,7 +7396,8 @@
 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   assert(VT.isScalableVector() ||
-         useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true));
+         useSVEForFixedLengthVectorVT(
+             VT, NeonOverride::OnlyIfSVEGreaterThan128Bits));

   SDLoc DL(Op);
   SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
@@ -7430,22 +7429,19 @@
   }

   if (VT.isScalableVector() ||
-      useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
+      useSVEForFixedLengthVectorVT(VT,
+                                   NeonOverride::OnlyIfSVEGreaterThan128Bits)) {
     switch (Opcode) {
     default:
       llvm_unreachable("Wrong instruction");
     case ISD::SMAX:
-      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
-                                 /*OverrideNEON=*/true);
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
     case ISD::SMIN:
-      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
-                                 /*OverrideNEON=*/true);
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
     case ISD::UMAX:
-      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
-                                 /*OverrideNEON=*/true);
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
     case ISD::UMIN:
-      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
-                                 /*OverrideNEON=*/true);
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
     }
   }
@@ -7460,9 +7456,9 @@
   EVT VT = Op.getValueType();

   if (VT.isScalableVector() ||
-      useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
-                               true);
+      useSVEForFixedLengthVectorVT(VT,
+                                   NeonOverride::OnlyIfSVEGreaterThan128Bits))
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);

   SDLoc DL(Op);
   SDValue REVB;
@@ -11087,7 +11083,7 @@
   EVT VT = Op.getValueType();
   SDLoc dl(Op);

-  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
+  if (useSVEForFixedLengthVectorVT(VT, NeonOverride::Always))
     return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);

   assert(VT.isScalableVector() && "Expected a scalable vector.");
@@ -11484,7 +11480,9 @@
                       (Op.getOpcode() != ISD::VECREDUCE_ADD &&
                        SrcVT.getVectorElementType() == MVT::i64);
   if (SrcVT.isScalableVector() ||
-      useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+      useSVEForFixedLengthVectorVT(
+          SrcVT, OverrideNEON ? NeonOverride::OnlyIfSVEGreaterThan128Bits
+                              : NeonOverride::Never)) {
     if (SrcVT.getVectorElementType() == MVT::i1)
       return LowerPredReductionToSVE(Op, DAG);
@@ -19028,7 +19026,7 @@

   // Scalable vector i32/i64 DIV is supported.
   if (EltVT == MVT::i32 || EltVT == MVT::i64)
-    return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, PredOpcode);

   // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
@@ -19183,13 +19181,14 @@
 // NOTE: The results for inactive lanes are undefined.
 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                    SelectionDAG &DAG,
-                                                   unsigned NewOp,
-                                                   bool OverrideNEON) const {
+                                                   unsigned NewOp) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
   auto Pg = getPredicateForVector(DAG, DL, VT);

-  if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
+  if (VT.isFixedLengthVector()) {
+    assert(useSVEForFixedLengthVectorVT(VT, NeonOverride::Always) &&
+           "Cannot use SVE to lower fixed length predicated op!");
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

     // Create list of operands by converting existing ones to scalable types.
@@ -19207,7 +19206,8 @@ continue; } - assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) && + assert(useSVEForFixedLengthVectorVT(V.getValueType(), + NeonOverride::Always) && "Only fixed length vectors are supported!"); Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); } @@ -19334,7 +19334,8 @@ SDValue VecOp = ScalarOp.getOperand(0); EVT SrcVT = VecOp.getValueType(); - if (useSVEForFixedLengthVectorVT(SrcVT, true)) { + if (useSVEForFixedLengthVectorVT(SrcVT, + NeonOverride::OnlyIfSVEGreaterThan128Bits)) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); } Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-div-128.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-div-128.ll @@ -0,0 +1,527 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; SDIV +; + +; Vector vXi8 sdiv are not legal for NEON so use SVE when available. +define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: ret + %res = sdiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: sunpkhi z2.h, z1.b +; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpkhi z3.s, z1.h +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: sdiv_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: sunpkhi z4.h, z0.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpkhi z6.s, z4.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpkhi z16.s, z0.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpkhi z5.h, z1.b +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpkhi z7.s, z5.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z1.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: 
uzp1 z4.h, z4.h, z6.h +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: sunpkhi z1.h, z3.b +; CHECK-NEXT: sunpkhi z6.h, z2.b +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z16.s +; CHECK-NEXT: sunpkhi z7.s, z1.h +; CHECK-NEXT: sunpkhi z16.s, z6.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sunpkhi z16.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z16.s +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z7.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.b, z2.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = sdiv <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +; Vector vXi16 sdiv are not legal for NEON so use SVE when available. +define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %res = sdiv <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: sdiv_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z6.s, z0.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: sunpkhi z4.s, z1.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpkhi z5.s, z2.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z4.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = sdiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +; Vector v2i32 sdiv are not legal for NEON so use SVE when available. 
+define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +; Vector v4i32 sdiv are not legal for NEON so use SVE when available. +define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: sdiv_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = sdiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +; Vector i64 sdiv are not legal for NEON so use SVE when available. +define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: sdiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +; Vector i64 sdiv are not legal for NEON so use SVE when available. +define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: sdiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: sdiv_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = sdiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; UDIV +; + +; Vector vXi8 udiv are not legal for NEON so use SVE when available. 
+define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: ret + %res = udiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpkhi z2.h, z1.b +; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpkhi z3.s, z1.h +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: udiv_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: uunpkhi z4.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z6.s, z4.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpkhi z16.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpkhi z5.h, z1.b +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpkhi z7.s, z5.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uunpkhi z1.h, z3.b +; CHECK-NEXT: uunpkhi z6.h, z2.b +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z16.s +; CHECK-NEXT: uunpkhi z7.s, z1.h +; CHECK-NEXT: uunpkhi z16.s, z6.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uunpkhi z16.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z16.s +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z7.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.b, z2.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = udiv <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +; Vector vXi16 udiv are not legal for NEON so use SVE when available. 
+define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %res = udiv <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: udiv_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z6.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: uunpkhi z4.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z4.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = udiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +; Vector v2i32 udiv are not legal for NEON so use SVE when available. +define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +; Vector v4i32 udiv are not legal for NEON so use SVE when available. +define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: udiv_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = udiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +; Vector i64 udiv are not legal for NEON so use SVE when available. 
+define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +; Vector i64 udiv are not legal for NEON so use SVE when available. +define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: udiv_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = udiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } +attributes #1 = { "target-features"="+sve" minsize } Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -1,4 +1,3 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512 @@ -17,9 +16,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; ; SDIV ; Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem-128.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem-128.ll @@ -0,0 +1,628 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; SREM +; + +; Vector vXi8 sdiv are not legal for NEON so use SVE when available. 
+define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: srem_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll v2.8h, v1.8b, #0 +; CHECK-NEXT: sshll v3.8h, v0.8b, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h +; CHECK-NEXT: xtn v2.8b, v2.8h +; CHECK-NEXT: mls v0.8b, v2.8b, v1.8b +; CHECK-NEXT: ret + %res = srem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: srem_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: sunpkhi z2.h, z1.b +; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z5.s, z2.h +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z4.h, z1.b +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: sunpkhi z6.s, z4.h +; CHECK-NEXT: sunpkhi z7.s, z3.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b +; CHECK-NEXT: mls v0.16b, v2.16b, v1.16b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: srem_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q3, q1, [x1] +; CHECK-NEXT: sunpkhi z5.h, z0.b +; CHECK-NEXT: sunpklo z7.h, z0.b +; CHECK-NEXT: sunpkhi z17.s, z5.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpkhi z4.h, z1.b +; CHECK-NEXT: sunpklo z6.h, z1.b +; CHECK-NEXT: sunpkhi z16.s, z4.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpkhi z18.s, z6.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h +; CHECK-NEXT: sunpkhi z6.h, z3.b +; CHECK-NEXT: sunpkhi z7.h, z2.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h +; CHECK-NEXT: sunpkhi z16.s, z6.h +; CHECK-NEXT: sunpkhi z17.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sunpklo z7.h, z3.b +; CHECK-NEXT: sunpklo z17.h, z2.b +; CHECK-NEXT: sunpkhi z18.s, z7.h +; CHECK-NEXT: sunpkhi z19.s, z17.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sunpklo z17.s, z17.h +; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z19.s +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z17.s +; CHECK-NEXT: uzp1 z6.h, z6.h, z16.h +; CHECK-NEXT: uzp1 z7.h, z7.h, z18.h +; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b +; CHECK-NEXT: uzp1 z5.b, z7.b, z6.b +; CHECK-NEXT: mls v2.16b, v5.16b, v3.16b +; CHECK-NEXT: mls v0.16b, v4.16b, v1.16b +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 
x i8>, <32 x i8>* %b + %res = srem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +; Vector vXi16 sdiv are not legal for NEON so use SVE when available. +define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: srem_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: xtn v2.4h, v2.4s +; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h +; CHECK-NEXT: ret + %res = srem <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: srem_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z4.s, z1.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z5.s, z0.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: srem_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z17.s, z2.h +; CHECK-NEXT: ldp q3, q1, [x1] +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z7.s, z0.h +; CHECK-NEXT: sunpkhi z16.s, z3.h +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: sunpkhi z4.s, z1.h +; CHECK-NEXT: sunpklo z6.s, z1.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sunpklo z7.s, z2.h +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h +; CHECK-NEXT: uzp1 z5.h, z5.h, z16.h +; CHECK-NEXT: mls v2.8h, v5.8h, v3.8h +; CHECK-NEXT: mls v0.8h, v4.8h, v1.8h +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = srem <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +; Vector v2i32 sdiv are not legal for NEON so use SVE when available. +define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: srem_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = srem <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +; Vector v4i32 sdiv are not legal for NEON so use SVE when available. 
+define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: srem_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: srem_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: mls v0.4s, v5.4s, v2.4s +; CHECK-NEXT: mls v1.4s, v4.4s, v3.4s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = srem <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +; Vector i64 sdiv are not legal for NEON so use SVE when available. +define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: srem_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: sub d0, d0, d1 +; CHECK-NEXT: ret + %res = srem <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +; Vector i64 sdiv are not legal for NEON so use SVE when available. 
+define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: srem_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov x8, v2.d[1] +; CHECK-NEXT: mov x11, v1.d[1] +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mul x8, x8, x11 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %res = srem <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: srem_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: sdiv z4.d, p0/m, z4.d, z0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z3.d +; CHECK-NEXT: fmov x12, d4 +; CHECK-NEXT: fmov x13, d3 +; CHECK-NEXT: fmov x14, d0 +; CHECK-NEXT: mov x10, v3.d[1] +; CHECK-NEXT: mov x11, v4.d[1] +; CHECK-NEXT: mul x9, x12, x9 +; CHECK-NEXT: mov x15, v0.d[1] +; CHECK-NEXT: mul x12, x14, x13 +; CHECK-NEXT: mul x8, x11, x8 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: mul x10, x15, x10 +; CHECK-NEXT: fmov d3, x12 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v3.d[1], x10 +; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: sub v1.2d, v2.2d, v3.2d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = srem <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; UREM +; + +; Vector vXi8 udiv are not legal for NEON so use SVE when available. 
+define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: urem_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v2.8h, v1.8b, #0 +; CHECK-NEXT: ushll v3.8h, v0.8b, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h +; CHECK-NEXT: xtn v2.8b, v2.8h +; CHECK-NEXT: mls v0.8b, v2.8b, v1.8b +; CHECK-NEXT: ret + %res = urem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: urem_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpkhi z2.h, z1.b +; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z4.h, z1.b +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: uunpkhi z6.s, z4.h +; CHECK-NEXT: uunpkhi z7.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b +; CHECK-NEXT: mls v0.16b, v2.16b, v1.16b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: urem_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q3, q1, [x1] +; CHECK-NEXT: uunpkhi z5.h, z0.b +; CHECK-NEXT: uunpklo z7.h, z0.b +; CHECK-NEXT: uunpkhi z17.s, z5.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpkhi z4.h, z1.b +; CHECK-NEXT: uunpklo z6.h, z1.b +; CHECK-NEXT: uunpkhi z16.s, z4.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpkhi z18.s, z6.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h +; CHECK-NEXT: uunpkhi z6.h, z3.b +; CHECK-NEXT: uunpkhi z7.h, z2.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h +; CHECK-NEXT: uunpkhi z16.s, z6.h +; CHECK-NEXT: uunpkhi z17.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: uunpklo z7.h, z3.b +; CHECK-NEXT: uunpklo z17.h, z2.b +; CHECK-NEXT: uunpkhi z18.s, z7.h +; CHECK-NEXT: uunpkhi z19.s, z17.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uunpklo z17.s, z17.h +; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z19.s +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z17.s +; CHECK-NEXT: uzp1 z6.h, z6.h, z16.h +; CHECK-NEXT: uzp1 z7.h, z7.h, z18.h +; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b +; CHECK-NEXT: uzp1 z5.b, z7.b, z6.b +; CHECK-NEXT: mls v2.16b, v5.16b, v3.16b +; CHECK-NEXT: mls v0.16b, v4.16b, v1.16b +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 
x i8>, <32 x i8>* %b + %res = urem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +; Vector vXi16 udiv are not legal for NEON so use SVE when available. +define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: urem_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-NEXT: ushll v3.4s, v0.4h, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: xtn v2.4h, v2.4s +; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h +; CHECK-NEXT: ret + %res = urem <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: urem_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z4.s, z1.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z5.s, z0.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: urem_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z17.s, z2.h +; CHECK-NEXT: ldp q3, q1, [x1] +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z7.s, z0.h +; CHECK-NEXT: uunpkhi z16.s, z3.h +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: uunpkhi z4.s, z1.h +; CHECK-NEXT: uunpklo z6.s, z1.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: uunpklo z7.s, z2.h +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h +; CHECK-NEXT: uzp1 z5.h, z5.h, z16.h +; CHECK-NEXT: mls v2.8h, v5.8h, v3.8h +; CHECK-NEXT: mls v0.8h, v4.8h, v1.8h +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = urem <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +; Vector v2i32 udiv are not legal for NEON so use SVE when available. +define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: urem_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = urem <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +; Vector v4i32 udiv are not legal for NEON so use SVE when available. 
+define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: urem_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: urem_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: mls v0.4s, v5.4s, v2.4s +; CHECK-NEXT: mls v1.4s, v4.4s, v3.4s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = urem <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +; Vector i64 udiv are not legal for NEON so use SVE when available. +define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: urem_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: sub d0, d0, d1 +; CHECK-NEXT: ret + %res = urem <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +; Vector i64 udiv are not legal for NEON so use SVE when available. 
+define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: urem_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov x8, v2.d[1] +; CHECK-NEXT: mov x11, v1.d[1] +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mul x8, x8, x11 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %res = urem <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: urem_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: udiv z4.d, p0/m, z4.d, z0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z3.d +; CHECK-NEXT: fmov x12, d4 +; CHECK-NEXT: fmov x13, d3 +; CHECK-NEXT: fmov x14, d0 +; CHECK-NEXT: mov x10, v3.d[1] +; CHECK-NEXT: mov x11, v4.d[1] +; CHECK-NEXT: mul x9, x12, x9 +; CHECK-NEXT: mov x15, v0.d[1] +; CHECK-NEXT: mul x12, x14, x13 +; CHECK-NEXT: mul x8, x11, x8 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: mul x10, x15, x10 +; CHECK-NEXT: fmov d3, x12 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v3.d[1], x10 +; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: sub v1.2d, v2.2d, v3.2d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = urem <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -1,4 +1,3 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512 @@ -17,9 +16,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; ; SREM ;
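
For reference, the selection policy that NeonOverride introduces can be summarised with a small standalone model. This is an illustrative sketch, not LLVM code: the free function useSVEForFixedLengthVector below, its parameters, and the main driver are hypothetical simplifications of AArch64TargetLowering::useSVEForFixedLengthVectorVT, which in the real implementation consults the EVT, the subtarget and the legal MVT lists. It only shows how the three enum values interact with +sve and -aarch64-sve-vector-bits-min, e.g. why the integer-divide lowering (which passes NeonOverride::Always) can use SVE's SDIV/UDIV even when only 128-bit SVE registers are guaranteed, as the new *-128.ll tests exercise.

// Illustrative model only -- not the actual LLVM implementation.
#include <cstdio>

enum class NeonOverride { Never, OnlyIfSVEGreaterThan128Bits, Always };

// Hypothetical stand-in for useSVEForFixedLengthVectorVT():
//  - VectorBits: size of the fixed-length vector type being lowered.
//  - HasSVE:     target has the +sve feature.
//  - MinSVEBits: value of -aarch64-sve-vector-bits-min (0 if unset).
// "useSVEForFixedLengthVectors" is modelled as MinSVEBits >= 256, mirroring
// the assumption that SVE is only preferred over NEON once its registers are
// known to be wider than 128 bits, unless an override says otherwise.
static bool useSVEForFixedLengthVector(unsigned VectorBits, bool HasSVE,
                                       unsigned MinSVEBits, NeonOverride NO) {
  bool IsNeonSized = VectorBits == 64 || VectorBits == 128;

  // NeonOverride::Always: every SVE implementation can handle NEON-sized
  // vectors, regardless of the known minimum SVE register width.
  if (IsNeonSized && NO == NeonOverride::Always && HasSVE)
    return true;

  // Otherwise SVE is only used when its registers are known to beat NEON.
  bool UseSVEForFixedLengthVectors = HasSVE && MinSVEBits >= 256;
  if (!UseSVEForFixedLengthVectors)
    return false;

  // NEON-sized vectors still need an explicit override.
  if (IsNeonSized)
    return NO != NeonOverride::Never;

  // Wider-than-NEON fixed vectors take the SVE path when they fit.
  return VectorBits <= MinSVEBits;
}

int main() {
  // v4i32 divide with only 128-bit SVE: Always still picks SVE.
  std::printf("%d\n", useSVEForFixedLengthVector(
                          128, true, 128, NeonOverride::Always)); // 1
  // The same type with OnlyIfSVEGreaterThan128Bits does not...
  std::printf("%d\n",
              useSVEForFixedLengthVector(
                  128, true, 128,
                  NeonOverride::OnlyIfSVEGreaterThan128Bits)); // 0
  // ...but it does once the minimum SVE width exceeds NEON.
  std::printf("%d\n",
              useSVEForFixedLengthVector(
                  128, true, 256,
                  NeonOverride::OnlyIfSVEGreaterThan128Bits)); // 1
  return 0;
}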