Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -249,6 +249,9 @@
   SADDV,
   UADDV,
 
+  // Unsigned sum long across vector
+  UADDLV,
+
   // Add Pairwise of two vectors
   ADDP,
   // Add Long Pairwise
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2420,6 +2420,7 @@
     MAKE_CASE(AArch64ISD::FCMLTz)
     MAKE_CASE(AArch64ISD::SADDV)
     MAKE_CASE(AArch64ISD::UADDV)
+    MAKE_CASE(AArch64ISD::UADDLV)
     MAKE_CASE(AArch64ISD::SDOT)
     MAKE_CASE(AArch64ISD::UDOT)
     MAKE_CASE(AArch64ISD::SMINV)
@@ -5315,6 +5316,20 @@
     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
                        Op.getOperand(1), Op.getOperand(2));
   }
+  case Intrinsic::aarch64_neon_uaddlv: {
+    EVT OpVT = Op.getOperand(1).getValueType();
+    EVT ResVT = Op.getValueType();
+    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+      // Use v4i32 rather than v2i32 to avoid an insert_subvector.
+      SDValue UADDLV =
+          DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
+      SDValue EXTRACT_VEC_ELT =
+          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
+                      DAG.getConstant(0, dl, MVT::i64));
+      return EXTRACT_VEC_ELT;
+    }
+    return SDValue();
+  }
   }
 }
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -752,6 +752,7 @@
 def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
 def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
 def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
+def AArch64uaddlv : SDNode<"AArch64ISD::UADDLV", SDT_AArch64uaddlp>;
 
 def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs),
                            [(abdu node:$lhs, node:$rhs),
@@ -6461,6 +6462,12 @@
             (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
             ssub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
+
+def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
+
 // Patterns for across-vector intrinsics, that have a node equivalent, that
 // returns a vector (with only the low lane defined) instead of a scalar.
 // In effect, opNode is the same as (scalar_to_vector (IntNode)).
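Note (illustration, not part of the patch): with this lowering, the i32 result of the intrinsic is modelled as lane 0 of a v4i32 AArch64ISD::UADDLV node, so consumers can read it straight from the SIMD register file. A minimal IR sketch of a shape that benefits, mirroring the dp1.ll change below; the function name is hypothetical, and the expected effect is that the store uses str s0 directly instead of first moving the sum to a GPR with fmov:

; Store the across-vector sum of the 8 byte lanes of %a to %p.
; uaddlv produces the sum in lane 0 of a SIMD register, and the i32
; store can now be emitted as a store of that s-register.
define void @store_uaddlv(<8 x i8> %a, ptr %p) {
  %sum = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
  store i32 %sum, ptr %p, align 4
  ret void
}

declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>)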
Index: llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -95,8 +95,8 @@
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-NEXT:    uaddlv.16b h0, v0
 ; CHECK-NEXT:    mov.s v1[0], v0[0]
-; CHECK-NEXT:    ucvtf.2s v1, v1
-; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ucvtf.2s v0, v1
+; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
 entry:
Index: llvm/test/CodeGen/AArch64/dp1.ll
===================================================================
--- llvm/test/CodeGen/AArch64/dp1.ll
+++ llvm/test/CodeGen/AArch64/dp1.ll
@@ -205,8 +205,7 @@
 ; CHECK-SDAG-NEXT:    fmov d0, x9
 ; CHECK-SDAG-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-SDAG-NEXT:    uaddlv h0, v0.8b
-; CHECK-SDAG-NEXT:    fmov w9, s0
-; CHECK-SDAG-NEXT:    str w9, [x8]
+; CHECK-SDAG-NEXT:    str s0, [x8]
 ; CHECK-SDAG-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: ctpop_i32:
Index: llvm/test/CodeGen/AArch64/neon-addlv.ll
===================================================================
--- llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -177,3 +177,21 @@
   %0 = and i32 %vaddlv.i, 65535
   ret i32 %0
 }
+
+define dso_local <8 x i8> @bar(<8 x i8> noundef %a) local_unnamed_addr #0 {
+; CHECK-LABEL: bar:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    rshrn v0.8b, v0.8h, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %vaddlv.i to i16
+  %vecinit.i = insertelement <8 x i16> undef, i16 %0, i64 0
+  %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> poison, <8 x i32> zeroinitializer
+  %vrshrn_n2 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %vecinit7.i, i32 3)
+  ret <8 x i8> %vrshrn_n2
+}
+
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
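Note (illustration, not part of the patch): the new bar test only exercises the v8i8 pattern; the v16i8 pattern added in AArch64InstrInfo.td is triggered by the same shape with a 128-bit source. A sketch of such a reproducer, with the function name hypothetical and the expected codegen (uaddlv h0, v0.16b followed by a lane-0 dup, with no fmov) an assumption rather than a checked result:

; Splat the across-vector sum of the 16 byte lanes of %a into all
; halfword lanes; with this patch the dup can read the uaddlv result
; directly from lane 0 of the SIMD register.
define <8 x i16> @uaddlv_splat_v16i8(<16 x i8> %a) {
  %sum = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a)
  %t = trunc i32 %sum to i16
  %ins = insertelement <8 x i16> poison, i16 %t, i64 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> poison, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>)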