diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -102,9 +102,13 @@ FMINNM_PRED, FMUL_PRED, FSUB_PRED, + HADDS_PRED, + HADDU_PRED, MUL_PRED, MULHS_PRED, MULHU_PRED, + RHADDS_PRED, + RHADDU_PRED, SDIV_PRED, SHL_PRED, SMAX_PRED, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1249,6 +1249,13 @@ setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); + + if (Subtarget->hasSVE2()) { + setOperationAction(ISD::AVGFLOORS, VT, Custom); + setOperationAction(ISD::AVGFLOORU, VT, Custom); + setOperationAction(ISD::AVGCEILS, VT, Custom); + setOperationAction(ISD::AVGCEILU, VT, Custom); + } } // Illegal unpacked integer vector types. 
@@ -2219,9 +2226,13 @@ MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) MAKE_CASE(AArch64ISD::ABDS_PRED) MAKE_CASE(AArch64ISD::ABDU_PRED) + MAKE_CASE(AArch64ISD::HADDS_PRED) + MAKE_CASE(AArch64ISD::HADDU_PRED) MAKE_CASE(AArch64ISD::MUL_PRED) MAKE_CASE(AArch64ISD::MULHS_PRED) MAKE_CASE(AArch64ISD::MULHU_PRED) + MAKE_CASE(AArch64ISD::RHADDS_PRED) + MAKE_CASE(AArch64ISD::RHADDU_PRED) MAKE_CASE(AArch64ISD::SDIV_PRED) MAKE_CASE(AArch64ISD::SHL_PRED) MAKE_CASE(AArch64ISD::SMAX_PRED) @@ -5945,6 +5956,14 @@ return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED); case ISD::ABDU: return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED); + case ISD::AVGFLOORS: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED); + case ISD::AVGFLOORU: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED); + case ISD::AVGCEILS: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED); + case ISD::AVGCEILU: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED); case ISD::BITREVERSE: return LowerBitreverse(Op, DAG); case ISD::BSWAP: diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -189,11 +189,15 @@ def AArch64lsr_p : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>; def AArch64mul_p : SDNode<"AArch64ISD::MUL_PRED", SDT_AArch64Arith>; def AArch64sabd_p : SDNode<"AArch64ISD::ABDS_PRED", SDT_AArch64Arith>; +def AArch64shadd_p : SDNode<"AArch64ISD::HADDS_PRED", SDT_AArch64Arith>; +def AArch64srhadd_p : SDNode<"AArch64ISD::RHADDS_PRED", SDT_AArch64Arith>; def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>; def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>; def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>; def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>; 
+def AArch64uhadd_p : SDNode<"AArch64ISD::HADDU_PRED", SDT_AArch64Arith>; +def AArch64urhadd_p : SDNode<"AArch64ISD::RHADDU_PRED", SDT_AArch64Arith>; def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>; @@ -258,6 +262,19 @@ (AArch64fsub_p (SVEAllActive), node:$op1, (vselect node:$pg, node:$op2, (SVEDup0))) ]>; +def AArch64shadd : PatFrags<(ops node:$pg, node:$op1, node:$op2), + [(int_aarch64_sve_shadd node:$pg, node:$op1, node:$op2), + (AArch64shadd_p node:$pg, node:$op1, node:$op2)]>; +def AArch64uhadd : PatFrags<(ops node:$pg, node:$op1, node:$op2), + [(int_aarch64_sve_uhadd node:$pg, node:$op1, node:$op2), + (AArch64uhadd_p node:$pg, node:$op1, node:$op2)]>; +def AArch64srhadd : PatFrags<(ops node:$pg, node:$op1, node:$op2), + [(int_aarch64_sve_srhadd node:$pg, node:$op1, node:$op2), + (AArch64srhadd_p node:$pg, node:$op1, node:$op2)]>; +def AArch64urhadd : PatFrags<(ops node:$pg, node:$op1, node:$op2), + [(int_aarch64_sve_urhadd node:$pg, node:$op1, node:$op2), + (AArch64urhadd_p node:$pg, node:$op1, node:$op2)]>; + def AArch64saba : PatFrags<(ops node:$op1, node:$op2, node:$op3), [(int_aarch64_sve_saba node:$op1, node:$op2, node:$op3), (add node:$op1, (AArch64sabd_p (SVEAllActive), node:$op2, node:$op3))]>; @@ -3288,12 +3305,12 @@ defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>; // SVE2 integer halving add/subtract (predicated) - defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>; - defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>; + defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", AArch64shadd>; + defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", AArch64uhadd>; defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>; defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, 
"uhsub", int_aarch64_sve_uhsub>; - defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>; - defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>; + defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", AArch64srhadd>; + defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", AArch64urhadd>; defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>; defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>; diff --git a/llvm/test/CodeGen/AArch64/sve2-hadd.ll b/llvm/test/CodeGen/AArch64/sve2-hadd.ll --- a/llvm/test/CodeGen/AArch64/sve2-hadd.ll +++ b/llvm/test/CodeGen/AArch64/sve2-hadd.ll @@ -1,6 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple aarch64-none-eabi -mattr=+sve2 -o - | FileCheck %s +define <vscale x 2 x i64> @hadds_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) { +; CHECK-LABEL: hadds_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: shadd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128> + %s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128> + %m = add <vscale x 2 x i128> %s0s, %s1s + %s = lshr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer) + %s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64> + ret <vscale x 2 x i64> %s2 +} + +define <vscale x 2 x i64> @haddu_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) { +; CHECK-LABEL: haddu_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uhadd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %s0s = zext <vscale x 2 x i64> %s0 to <vscale x 2 x i128> + %s1s = zext <vscale x 2 x i64> %s1 to <vscale x 2 x i128> + %m = add <vscale x 2 x i128> %s0s, %s1s + %s = lshr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer) + %s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64> + ret <vscale x 2 x i64> %s2 +} + define <vscale x 2 x i32> @hadds_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i32> %s1) { ; CHECK-LABEL: hadds_v2i32: ; CHECK: // %bb.0: // %entry @@ -37,15 +67,8 @@ define <vscale x 4 x i32> @hadds_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) { ; CHECK-LABEL: hadds_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpkhi z2.d, z0.s -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: sunpkhi z3.d, z1.s -; CHECK-NEXT: sunpklo z1.d, z1.s -; 
CHECK-NEXT: add z0.d, z0.d, z1.d -; CHECK-NEXT: add z1.d, z2.d, z3.d -; CHECK-NEXT: lsr z1.d, z1.d, #1 -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: shadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: %s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64> @@ -59,15 +82,8 @@ define <vscale x 4 x i32> @haddu_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) { ; CHECK-LABEL: haddu_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpkhi z3.d, z1.s -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: add z0.d, z0.d, z1.d -; CHECK-NEXT: add z1.d, z2.d, z3.d -; CHECK-NEXT: lsr z1.d, z1.d, #1 -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uhadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: %s0s = zext <vscale x 4 x i32> %s0 to <vscale x 4 x i64> @@ -152,15 +168,8 @@ define <vscale x 8 x i16> @hadds_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) { ; CHECK-LABEL: hadds_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpkhi z2.s, z0.h -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpkhi z3.s, z1.h -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: add z1.s, z2.s, z3.s -; CHECK-NEXT: lsr z1.s, z1.s, #1 -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: shadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret entry: %s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32> @@ -174,15 +183,8 @@ define <vscale x 8 x i16> @haddu_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) { ; CHECK-LABEL: haddu_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z2.s, z0.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpkhi z3.s, z1.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: add z1.s, z2.s, z3.s -; CHECK-NEXT: lsr z1.s, z1.s, #1 -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: uhadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret entry: %s0s = zext <vscale x 8 x i16> %s0 to <vscale x 8 x i32> @@ -267,15 +269,8 @@ define <vscale x 16 x i8> @hadds_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) { ; CHECK-LABEL: hadds_v16i8: ; CHECK: // 
%bb.0: // %entry -; CHECK-NEXT: sunpkhi z2.h, z0.b -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpkhi z3.h, z1.b -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: add z0.h, z0.h, z1.h -; CHECK-NEXT: add z1.h, z2.h, z3.h -; CHECK-NEXT: lsr z1.h, z1.h, #1 -; CHECK-NEXT: lsr z0.h, z0.h, #1 -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: shadd z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret entry: %s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16> @@ -289,15 +284,8 @@ define <vscale x 16 x i8> @haddu_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) { ; CHECK-LABEL: haddu_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z2.h, z0.b -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpkhi z3.h, z1.b -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: add z0.h, z0.h, z1.h -; CHECK-NEXT: add z1.h, z2.h, z3.h -; CHECK-NEXT: lsr z1.h, z1.h, #1 -; CHECK-NEXT: lsr z0.h, z0.h, #1 -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uhadd z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret entry: %s0s = zext <vscale x 16 x i8> %s0 to <vscale x 16 x i16> @@ -308,6 +296,38 @@ ret <vscale x 16 x i8> %s2 } +define <vscale x 2 x i64> @rhadds_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) { +; CHECK-LABEL: rhadds_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: srhadd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128> + %s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128> + %add = add <vscale x 2 x i128> %s0s, %s1s + %add2 = add <vscale x 2 x i128> %add, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer) + %s = lshr <vscale x 2 x i128> %add2, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer) + %result = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64> + ret <vscale x 2 x i64> %result +} + +define <vscale x 2 x i64> @rhaddu_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) { +; CHECK-LABEL: rhaddu_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: urhadd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %s0s = zext <vscale x 2 x i64> %s0 to <vscale x 2 x i128> + %s1s = zext <vscale x 2 x i64> %s1 to <vscale x 2 x i128> + %add = add <vscale x 2 x i128> %s0s, %s1s + %add2 = add <vscale x 2 x i128> %add, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer) + %s = lshr <vscale x 2 x i128> %add2, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer) + %result = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64> 
+ ret <vscale x 2 x i64> %result +} + define <vscale x 2 x i32> @rhadds_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i32> %s1) { ; CHECK-LABEL: rhadds_v2i32: ; CHECK: // %bb.0: // %entry @@ -352,18 +372,8 @@ define <vscale x 4 x i32> @rhadds_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) { ; CHECK-LABEL: rhadds_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sunpkhi z3.d, z0.s -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: sunpkhi z4.d, z1.s -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: eor z2.d, z3.d, z2.d -; CHECK-NEXT: sub z0.d, z1.d, z0.d -; CHECK-NEXT: sub z1.d, z4.d, z2.d -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: lsr z1.d, z1.d, #1 -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: srhadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: %s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64> @@ -378,18 +388,8 @@ define <vscale x 4 x i32> @rhaddu_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) { ; CHECK-LABEL: rhaddu_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uunpkhi z3.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpkhi z4.d, z1.s -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: eor z2.d, z3.d, z2.d -; CHECK-NEXT: sub z0.d, z1.d, z0.d -; CHECK-NEXT: sub z1.d, z4.d, z2.d -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: lsr z1.d, z1.d, #1 -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: urhadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: %s0s = zext <vscale x 4 x i32> %s0 to <vscale x 4 x i64> @@ -487,18 +487,8 @@ define <vscale x 8 x i16> @rhadds_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) { ; CHECK-LABEL: rhadds_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sunpkhi z3.s, z0.h -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpkhi z4.s, z1.h -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: eor z2.d, z3.d, z2.d -; CHECK-NEXT: sub z0.s, z1.s, z0.s -; CHECK-NEXT: sub z1.s, z4.s, z2.s -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: lsr z1.s, z1.s, #1 -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; 
CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: srhadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret entry: %s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32> @@ -513,18 +503,8 @@ define <vscale x 8 x i16> @rhaddu_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) { ; CHECK-LABEL: rhaddu_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uunpkhi z3.s, z0.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpkhi z4.s, z1.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: eor z2.d, z3.d, z2.d -; CHECK-NEXT: sub z0.s, z1.s, z0.s -; CHECK-NEXT: sub z1.s, z4.s, z2.s -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: lsr z1.s, z1.s, #1 -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: urhadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret entry: %s0s = zext <vscale x 8 x i16> %s0 to <vscale x 8 x i32> @@ -622,18 +602,8 @@ define <vscale x 16 x i8> @rhadds_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) { ; CHECK-LABEL: rhadds_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sunpkhi z3.h, z0.b -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpkhi z4.h, z1.b -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: eor z2.d, z3.d, z2.d -; CHECK-NEXT: sub z0.h, z1.h, z0.h -; CHECK-NEXT: sub z1.h, z4.h, z2.h -; CHECK-NEXT: lsr z0.h, z0.h, #1 -; CHECK-NEXT: lsr z1.h, z1.h, #1 -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: srhadd z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret entry: %s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16> @@ -648,18 +618,8 @@ define <vscale x 16 x i8> @rhaddu_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) { ; CHECK-LABEL: rhaddu_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uunpkhi z3.h, z0.b -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpkhi z4.h, z1.b -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: eor z2.d, z3.d, z2.d -; CHECK-NEXT: sub z0.h, z1.h, z0.h -; CHECK-NEXT: sub z1.h, z4.h, z2.h -; CHECK-NEXT: lsr z0.h, z0.h, #1 -; CHECK-NEXT: lsr z1.h, z1.h, #1 -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; 
CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: urhadd z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret entry: %s0s = zext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>