diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -980,8 +980,8 @@
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
-                              bool OverrideNEON = false) const;
+  SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
+                              unsigned NewOp) const;
   SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1331,6 +1331,13 @@
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
 
+    // NEON doesn't support integer divides, but SVE does
+    for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+                    MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+      setOperationAction(ISD::SDIV, VT, Custom);
+      setOperationAction(ISD::UDIV, VT, Custom);
+    }
+
     // NOTE: Currently this has to happen after computeRegisterProperties rather
     // than the preferred option of combining it with the addRegisterClass call.
     if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1363,26 +1370,10 @@
       setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
       setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
       setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
-      setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
-      setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
-      setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
-      setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
-      setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
-      setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
-      setOperationAction(ISD::SDIV, MVT::v1i64, Custom);
-      setOperationAction(ISD::SDIV, MVT::v2i64, Custom);
       setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
       setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
       setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
       setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
-      setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
-      setOperationAction(ISD::UDIV, MVT::v16i8, Custom);
-      setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
-      setOperationAction(ISD::UDIV, MVT::v8i16, Custom);
-      setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
-      setOperationAction(ISD::UDIV, MVT::v4i32, Custom);
-      setOperationAction(ISD::UDIV, MVT::v1i64, Custom);
-      setOperationAction(ISD::UDIV, MVT::v2i64, Custom);
       setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
       setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
       setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
@@ -3956,7 +3947,7 @@
   bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
 
   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
 
   // Multiplications are only custom-lowered for 128-bit vectors so that
   // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
@@ -5157,11 +5148,9 @@
   case ISD::MUL:
     return LowerMUL(Op, DAG);
   case ISD::MULHS:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
   case ISD::MULHU:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
   case ISD::INTRINSIC_W_CHAIN:
     return LowerINTRINSIC_W_CHAIN(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
@@ -5252,8 +5241,7 @@
   case ISD::BSWAP:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
   case ISD::CTLZ:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
   case ISD::CTTZ:
     return LowerCTTZ(Op, DAG);
   case ISD::VECTOR_SPLICE:
@@ -7514,17 +7502,13 @@
   default:
     llvm_unreachable("Wrong instruction");
   case ISD::SMAX:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
   case ISD::SMIN:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
   case ISD::UMAX:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
   case ISD::UMIN:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
   }
 }
 
@@ -7540,8 +7524,7 @@
 
   if (VT.isScalableVector() ||
       useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
-                               true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
 
   SDLoc DL(Op);
   SDValue REVB;
@@ -11189,7 +11172,7 @@
   EVT VT = Op.getValueType();
   SDLoc dl(Op);
 
-  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
+  if (VT.isFixedLengthVector() && Subtarget->hasSVE())
     return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
 
   assert(VT.isScalableVector() && "Expected a scalable vector.");
@@ -19224,7 +19207,7 @@
 
   // Scalable vector i32/i64 DIV is supported.
   if (EltVT == MVT::i32 || EltVT == MVT::i64)
-    return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, PredOpcode);
 
   // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
@@ -19379,13 +19362,14 @@
 // NOTE: The results for inactive lanes are undefined.
 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                    SelectionDAG &DAG,
-                                                   unsigned NewOp,
-                                                   bool OverrideNEON) const {
+                                                   unsigned NewOp) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
   auto Pg = getPredicateForVector(DAG, DL, VT);
 
-  if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
+  if (VT.isFixedLengthVector()) {
+    assert(VT.getFixedSizeInBits() <= Subtarget->getMinSVEVectorSizeInBits() &&
+           "Cannot use SVE to lower fixed length predicated op!");
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
 
     // Create list of operands by converting existing ones to scalable types.
@@ -19403,7 +19387,8 @@
         continue;
       }
 
-      assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
+      assert(V.getValueType().getFixedSizeInBits() <=
+                 Subtarget->getMinSVEVectorSizeInBits() &&
              "Only fixed length vectors are supported!");
       Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
     }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=VBITS_EQ_128
 ; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
 ; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
@@ -17,14 +17,12 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; SDIV
 ;
 
 ; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: sdiv_v8i8:
 ; CHECK: ptrue [[PG0:p[0-9]+]].s, vl8
@@ -51,6 +49,21 @@
 ; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
 ; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v8i8:
+; VBITS_EQ_128: sshll v1.8h, v1.8b, #0
+; VBITS_EQ_128-NEXT: sshll v0.8h, v0.8b, #0
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT: sunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_EQ_128-NEXT: xtn v0.8b, v0.8h
+; VBITS_EQ_128-NEXT: ret
+
   %res = sdiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -81,6 +94,30 @@
 ; VBITS_GE_512-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
 ; VBITS_GE_512-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v16i8:
+; VBITS_EQ_128: sunpkhi z2.h, z1.b
+; VBITS_EQ_128-NEXT: sunpkhi z3.h, z0.b
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sunpklo z1.h, z1.b
+; VBITS_EQ_128-NEXT: sunpkhi z4.s, z2.h
+; VBITS_EQ_128-NEXT: sunpkhi z5.s, z3.h
+; VBITS_EQ_128-NEXT: sunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT: sunpklo z0.h, z0.b
+; VBITS_EQ_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: sunpkhi z3.s, z1.h
+; VBITS_EQ_128-NEXT: sunpkhi z5.s, z0.h
+; VBITS_EQ_128-NEXT: sunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
+; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: uzp1 z1.h, z2.h, z4.h
+; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z3.h
+; VBITS_EQ_128-NEXT: uzp1 z0.b, z0.b, z1.b
+; VBITS_EQ_128-NEXT: ret
+
   %res = sdiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -309,6 +346,7 @@
 }
 
 ; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: sdiv_v4i16:
 ; CHECK: sshll v1.4s, v1.4h, #0
@@ -323,6 +361,15 @@
 ; CHECK-NEXT: mov v0.h[2], w9
 ; CHECK-NEXT: mov v0.h[3], w8
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v4i16:
+; VBITS_EQ_128: sshll v1.4s, v1.4h, #0
+; VBITS_EQ_128-NEXT: sshll v0.4s, v0.4h, #0
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: xtn v0.4h, v0.4s
+; VBITS_EQ_128-NEXT: ret
+
   %res = sdiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -335,6 +382,18 @@
 ; CHECK-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v8i16:
+; VBITS_EQ_128: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT: sunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_EQ_128-NEXT: ret
+
   %res = sdiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -472,6 +531,12 @@
 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
 ; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v2i32:
+; VBITS_EQ_128: ptrue p0.s, vl2
+; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: ret
+
   %res = sdiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -482,6 +547,12 @@
 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
 ; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v4i32:
+; VBITS_EQ_128: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: ret
+
   %res = sdiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -552,6 +623,12 @@
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
 ; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v1i64:
+; VBITS_EQ_128: ptrue p0.d, vl1
+; VBITS_EQ_128-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: ret
+
   %res = sdiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -562,6 +639,12 @@
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
 ; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v2i64:
+; VBITS_EQ_128: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: ret
+
   %res = sdiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -631,6 +714,7 @@
 ;
 
 ; Vector vXi8 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: udiv_v8i8:
 ; CHECK: ptrue [[PG0:p[0-9]+]].s, vl8
@@ -657,6 +741,21 @@
 ; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
 ; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v8i8:
+; VBITS_EQ_128: ushll v1.8h, v1.8b, #0
+; VBITS_EQ_128-NEXT: ushll v0.8h, v0.8b, #0
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: uunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_EQ_128-NEXT: xtn v0.8b, v0.8h
+; VBITS_EQ_128-NEXT: ret
+
   %res = udiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -687,6 +786,30 @@
 ; VBITS_GE_512-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
 ; VBITS_GE_512-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v16i8:
+; VBITS_EQ_128: uunpkhi z2.h, z1.b
+; VBITS_EQ_128-NEXT: uunpkhi z3.h, z0.b
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: uunpklo z1.h, z1.b
+; VBITS_EQ_128-NEXT: uunpkhi z4.s, z2.h
+; VBITS_EQ_128-NEXT: uunpkhi z5.s, z3.h
+; VBITS_EQ_128-NEXT: uunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT: uunpklo z0.h, z0.b
+; VBITS_EQ_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: uunpkhi z3.s, z1.h
+; VBITS_EQ_128-NEXT: uunpkhi z5.s, z0.h
+; VBITS_EQ_128-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s
+; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: uzp1 z1.h, z2.h, z4.h
+; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z3.h
+; VBITS_EQ_128-NEXT: uzp1 z0.b, z0.b, z1.b
+; VBITS_EQ_128-NEXT: ret
+
   %res = udiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -913,6 +1036,7 @@
 }
 
 ; Vector vXi16 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: udiv_v4i16:
 ; CHECK: ushll v1.4s, v1.4h, #0
@@ -927,6 +1051,15 @@
 ; CHECK-NEXT: mov v0.h[2], w9
 ; CHECK-NEXT: mov v0.h[3], w8
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v4i16:
+; VBITS_EQ_128: ushll v1.4s, v1.4h, #0
+; VBITS_EQ_128-NEXT: ushll v0.4s, v0.4h, #0
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: xtn v0.4h, v0.4s
+; VBITS_EQ_128-NEXT: ret
+
   %res = udiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -939,6 +1072,18 @@
 ; CHECK-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v8i16:
+; VBITS_EQ_128: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: uunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_EQ_128-NEXT: ret
+
   %res = udiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -1076,6 +1221,12 @@
 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
 ; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v2i32:
+; VBITS_EQ_128: ptrue p0.s, vl2
+; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: ret
+
   %res = udiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -1086,6 +1237,12 @@
 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
 ; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v4i32:
+; VBITS_EQ_128: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT: ret
+
   %res = udiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -1098,6 +1255,7 @@
 ; CHECK-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
 ; CHECK-NEXT: ret
+
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = udiv <8 x i32> %op1, %op2
@@ -1156,6 +1314,12 @@
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
 ; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v1i64:
+; VBITS_EQ_128: ptrue p0.d, vl1
+; VBITS_EQ_128-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: ret
+
   %res = udiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -1166,6 +1330,12 @@
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
 ; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v2i64:
+; VBITS_EQ_128: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: ret
+
   %res = udiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=VBITS_EQ_128
 ; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
 ; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
@@ -17,14 +17,12 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; SREM
 ;
 
 ; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: srem_v8i8:
 ; CHECK: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2:z[0-9]+]].b
@@ -52,6 +50,22 @@
 ; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR8]]
 ; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v8i8:
+; VBITS_EQ_128: sshll v2.8h, v1.8b, #0
+; VBITS_EQ_128-NEXT: sshll v3.8h, v0.8b, #0
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sunpkhi z4.s, z2.h
+; VBITS_EQ_128-NEXT: sunpkhi z5.s, z3.h
+; VBITS_EQ_128-NEXT: sunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: uzp1 z2.h, z2.h, z4.h
+; VBITS_EQ_128-NEXT: xtn v2.8b, v2.8h
+; VBITS_EQ_128-NEXT: mls v0.8b, v2.8b, v1.8b
+; VBITS_EQ_128-NEXT: ret
+
   %res = srem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -84,6 +98,31 @@
 ; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
 ; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v16i8:
+; VBITS_EQ_128: sunpkhi z2.h, z1.b
+; VBITS_EQ_128-NEXT: sunpkhi z3.h, z0.b
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sunpkhi z5.s, z2.h
+; VBITS_EQ_128-NEXT: sunpkhi z6.s, z3.h
+; VBITS_EQ_128-NEXT: sunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT: sunpklo z4.h, z1.b
+; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: sunpklo z3.h, z0.b
+; VBITS_EQ_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
+; VBITS_EQ_128-NEXT: sunpkhi z6.s, z4.h
+; VBITS_EQ_128-NEXT: sunpkhi z7.s, z3.h
+; VBITS_EQ_128-NEXT: sunpklo z4.s, z4.h
+; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
+; VBITS_EQ_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_EQ_128-NEXT: uzp1 z2.h, z2.h, z5.h
+; VBITS_EQ_128-NEXT: uzp1 z3.h, z3.h, z6.h
+; VBITS_EQ_128-NEXT: uzp1 z2.b, z3.b, z2.b
+; VBITS_EQ_128-NEXT: mls v0.16b, v2.16b, v1.16b
+; VBITS_EQ_128-NEXT: ret
+
   %res = srem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -330,6 +369,7 @@
 }
 
 ; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: srem_v4i16:
 ; CHECK: sshll v2.4s, v1.4h, #0
@@ -345,6 +385,16 @@
 ; CHECK-NEXT: mov [[VEC2]].h[3], [[SCALAR3]]
 ; CHECK-NEXT: mls v0.4h, [[VEC2]].4h, v1.4h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v4i16:
+; VBITS_EQ_128: sshll v2.4s, v1.4h, #0
+; VBITS_EQ_128-NEXT: sshll v3.4s, v0.4h, #0
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: xtn v2.4h, v2.4s
+; VBITS_EQ_128-NEXT: mls v0.4h, v2.4h, v1.4h
+; VBITS_EQ_128-NEXT: ret
+
   %res = srem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -358,6 +408,20 @@
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v8i16:
+; VBITS_EQ_128: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT: sunpklo z4.s, z1.h
+; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: sunpklo z5.s, z0.h
+; VBITS_EQ_128-NEXT: movprfx z3, z5
+; VBITS_EQ_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_EQ_128-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_EQ_128-NEXT: mls v0.8h, v2.8h, v1.8h
+; VBITS_EQ_128-NEXT: ret
+
   %res = srem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -395,6 +459,7 @@
 ; VBITS_GE_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
 ; VBITS_GE_512-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0]
 ; CHECK: ret
+
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = srem <16 x i16> %op1, %op2
@@ -513,6 +578,14 @@
 ; CHECK-NEXT: sdiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
 ; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v2i32:
+; VBITS_EQ_128: ptrue p0.s, vl2
+; VBITS_EQ_128-NEXT: movprfx z2, z0
+; VBITS_EQ_128-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; VBITS_EQ_128-NEXT: mls v0.2s, v2.2s, v1.2s
+; VBITS_EQ_128-NEXT: ret
+
   %res = srem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -525,6 +598,14 @@
 ; CHECK-NEXT: sdiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: srem_v4i32:
+; VBITS_EQ_128: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: movprfx z2, z0
+; VBITS_EQ_128-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; VBITS_EQ_128-NEXT: mls v0.4s, v2.4s, v1.4s
+; VBITS_EQ_128-NEXT: ret
+
   %res = srem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -602,6 +683,7 @@
 }
 
 ; Vector i64 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for the 128 bits case here.
 define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: srem_v1i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
@@ -610,11 +692,24 @@
 ; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
 ; CHECK-NEXT: sub d0, d0, d1
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: srem_v1i64:
+; VBITS_EQ_128: ptrue p0.d, vl1
+; VBITS_EQ_128-NEXT: movprfx z2, z0
+; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; VBITS_EQ_128-NEXT: fmov x8, d2
+; VBITS_EQ_128-NEXT: fmov x9, d1
+; VBITS_EQ_128-NEXT: mul x8, x8, x9
+; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: sub d0, d0, d1
+; VBITS_EQ_128-NEXT: ret
+
   %res = srem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Vector i64 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for the 128 bits case here.
 define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: srem_v2i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
@@ -623,6 +718,22 @@
 ; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: srem_v2i64:
+; VBITS_EQ_128: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: movprfx z2, z0
+; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; VBITS_EQ_128-NEXT: fmov x9, d2
+; VBITS_EQ_128-NEXT: fmov x10, d1
+; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
+; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
+; VBITS_EQ_128-NEXT: mul x9, x9, x10
+; VBITS_EQ_128-NEXT: mul x8, x8, x11
+; VBITS_EQ_128-NEXT: fmov d1, x9
+; VBITS_EQ_128-NEXT: mov v1.d[1], x8
+; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
+; VBITS_EQ_128-NEXT: ret
+
   %res = srem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -704,6 +815,7 @@
 ;
 
 ; Vector vXi8 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: urem_v8i8:
 ; CHECK: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2:z[0-9]+]].b
@@ -731,6 +843,22 @@
 ; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
 ; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v8i8:
+; VBITS_EQ_128: ushll v2.8h, v1.8b, #0
+; VBITS_EQ_128-NEXT: ushll v3.8h, v0.8b, #0
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: uunpkhi z4.s, z2.h
+; VBITS_EQ_128-NEXT: uunpkhi z5.s, z3.h
+; VBITS_EQ_128-NEXT: uunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: uzp1 z2.h, z2.h, z4.h
+; VBITS_EQ_128-NEXT: xtn v2.8b, v2.8h
+; VBITS_EQ_128-NEXT: mls v0.8b, v2.8b, v1.8b
+; VBITS_EQ_128-NEXT: ret
+
   %res = urem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -763,6 +891,31 @@
 ; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
 ; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v16i8:
+; VBITS_EQ_128: uunpkhi z2.h, z1.b
+; VBITS_EQ_128-NEXT: uunpkhi z3.h, z0.b
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: uunpkhi z5.s, z2.h
+; VBITS_EQ_128-NEXT: uunpkhi z6.s, z3.h
+; VBITS_EQ_128-NEXT: uunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT: uunpklo z4.h, z1.b
+; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: uunpklo z3.h, z0.b
+; VBITS_EQ_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s
+; VBITS_EQ_128-NEXT: uunpkhi z6.s, z4.h
+; VBITS_EQ_128-NEXT: uunpkhi z7.s, z3.h
+; VBITS_EQ_128-NEXT: uunpklo z4.s, z4.h
+; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
+; VBITS_EQ_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_EQ_128-NEXT: uzp1 z2.h, z2.h, z5.h
+; VBITS_EQ_128-NEXT: uzp1 z3.h, z3.h, z6.h
+; VBITS_EQ_128-NEXT: uzp1 z2.b, z3.b, z2.b
+; VBITS_EQ_128-NEXT: mls v0.16b, v2.16b, v1.16b
+; VBITS_EQ_128-NEXT: ret
+
   %res = urem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -1007,6 +1160,7 @@
 }
 
 ; Vector vXi16 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: urem_v4i16:
 ; CHECK: ushll v2.4s, v1.4h, #0
@@ -1022,6 +1176,16 @@
 ; CHECK-NEXT: mov [[VECO]].h[3], [[SCALAR3]]
 ; CHECK-NEXT: mls v0.4h, [[VECO]].4h, v1.4h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v4i16:
+; VBITS_EQ_128: ushll v2.4s, v1.4h, #0
+; VBITS_EQ_128-NEXT: ushll v3.4s, v0.4h, #0
+; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: xtn v2.4h, v2.4s
+; VBITS_EQ_128-NEXT: mls v0.4h, v2.4h, v1.4h
+; VBITS_EQ_128-NEXT: ret
+
   %res = urem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -1035,6 +1199,20 @@
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v8i16:
+; VBITS_EQ_128: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: uunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT: uunpklo z4.s, z1.h
+; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT: uunpklo z5.s, z0.h
+; VBITS_EQ_128-NEXT: movprfx z3, z5
+; VBITS_EQ_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_EQ_128-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_EQ_128-NEXT: mls v0.8h, v2.8h, v1.8h
+; VBITS_EQ_128-NEXT: ret
+
   %res = urem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -1190,6 +1368,14 @@
 ; CHECK-NEXT: udiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
 ; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v2i32:
+; VBITS_EQ_128: ptrue p0.s, vl2
+; VBITS_EQ_128-NEXT: movprfx z2, z0
+; VBITS_EQ_128-NEXT: udiv z2.s, p0/m, z2.s, z1.s
+; VBITS_EQ_128-NEXT: mls v0.2s, v2.2s, v1.2s
+; VBITS_EQ_128-NEXT: ret
+
   %res = urem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -1202,6 +1388,14 @@
 ; CHECK-NEXT: udiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: urem_v4i32:
+; VBITS_EQ_128: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: movprfx z2, z0
+; VBITS_EQ_128-NEXT: udiv z2.s, p0/m, z2.s, z1.s
+; VBITS_EQ_128-NEXT: mls v0.4s, v2.4s, v1.4s
+; VBITS_EQ_128-NEXT: ret
+
   %res = urem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -1279,6 +1473,7 @@
 }
 
 ; Vector i64 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for the 128 bits case here.
 define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: urem_v1i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
@@ -1287,11 +1482,24 @@
 ; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
 ; CHECK-NEXT: sub d0, d0, d1
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: urem_v1i64:
+; VBITS_EQ_128: ptrue p0.d, vl1
+; VBITS_EQ_128-NEXT: movprfx z2, z0
+; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; VBITS_EQ_128-NEXT: fmov x8, d2
+; VBITS_EQ_128-NEXT: fmov x9, d1
+; VBITS_EQ_128-NEXT: mul x8, x8, x9
+; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: sub d0, d0, d1
+; VBITS_EQ_128-NEXT: ret
+
   %res = urem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Vector i64 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for the 128 bits case here.
 define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: urem_v2i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
@@ -1300,6 +1508,22 @@
 ; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: urem_v2i64:
+; VBITS_EQ_128: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: movprfx z2, z0
+; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; VBITS_EQ_128-NEXT: fmov x9, d2
+; VBITS_EQ_128-NEXT: fmov x10, d1
+; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
+; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
+; VBITS_EQ_128-NEXT: mul x9, x9, x10
+; VBITS_EQ_128-NEXT: mul x8, x8, x11
+; VBITS_EQ_128-NEXT: fmov d1, x9
+; VBITS_EQ_128-NEXT: mov v1.d[1], x8
+; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
+; VBITS_EQ_128-NEXT: ret
+
   %res = urem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }