diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1087,6 +1087,7 @@
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
                                          SDValue &Size,
                                          SelectionDAG &DAG) const;
+  SDValue LowerAVGFloor_AVGCeil(SDValue Node, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
                                                SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1289,12 +1289,10 @@
       setOperationAction(ISD::SDIVREM, VT, Expand);
       setOperationAction(ISD::UDIVREM, VT, Expand);
 
-      if (Subtarget->hasSVE2()) {
-        setOperationAction(ISD::AVGFLOORS, VT, Custom);
-        setOperationAction(ISD::AVGFLOORU, VT, Custom);
-        setOperationAction(ISD::AVGCEILS, VT, Custom);
-        setOperationAction(ISD::AVGCEILU, VT, Custom);
-      }
+      setOperationAction(ISD::AVGFLOORS, VT, Custom);
+      setOperationAction(ISD::AVGFLOORU, VT, Custom);
+      setOperationAction(ISD::AVGCEILS, VT, Custom);
+      setOperationAction(ISD::AVGCEILU, VT, Custom);
     }
 
     // Illegal unpacked integer vector types.
@@ -6089,13 +6087,21 @@
   case ISD::ABDU:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
   case ISD::AVGFLOORS:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED);
+    if (Subtarget->hasSVE2())
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED);
+    return LowerAVGFloor_AVGCeil(Op, DAG);
   case ISD::AVGFLOORU:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED);
+    if (Subtarget->hasSVE2())
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED);
+    return LowerAVGFloor_AVGCeil(Op, DAG);
   case ISD::AVGCEILS:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED);
+    if (Subtarget->hasSVE2())
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED);
+    return LowerAVGFloor_AVGCeil(Op, DAG);
   case ISD::AVGCEILU:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED);
+    if (Subtarget->hasSVE2())
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED);
+    return LowerAVGFloor_AVGCeil(Op, DAG);
   case ISD::BITREVERSE:
     return LowerBitreverse(Op, DAG);
   case ISD::BSWAP:
@@ -13378,6 +13384,80 @@
   return Chain;
 }
 
+static bool IsZeroExtended(SDValue Node) {
+  APInt SplatVal;
+  if (Node.getOpcode() == ISD::AND &&
+      ISD::isConstantSplatVector(Node.getOperand(1).getNode(), SplatVal))
+    return SplatVal.isMask() &&
+           SplatVal.countPopulation() <
+               Node->getValueType(0).getScalarSizeInBits();
+  return false;
+}
+
+static bool IsSignExtended(SDValue Node) {
+  if (Node.getOpcode() != ISD::SIGN_EXTEND_INREG)
+    return false;
+  return cast<VTSDNode>(Node.getOperand(1))->getVT().getScalarSizeInBits() <
+         Node->getValueType(0).getScalarSizeInBits();
+}
+
+SDValue AArch64TargetLowering::LowerAVGFloor_AVGCeil(SDValue Node,
+                                                     SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  SDValue OpA = Node->getOperand(0);
+  SDValue OpB = Node->getOperand(1);
+  EVT VT = Node->getValueType(0);
+  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
+
+  SDValue ConstantOne = DAG.getConstant(1, dl, VT);
+
+  unsigned Opc = Node->getOpcode();
+  bool IsCeil = Opc == ISD::AVGCEILS || Opc == ISD::AVGCEILU;
+  bool IsSigned = Opc == ISD::AVGFLOORS || Opc == ISD::AVGCEILS;
+  unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
+
+  // If both operands are known to be extended from a narrower type, the
+  // addition has at least one bit of headroom and cannot overflow, so
+  // emitting a plain add + shift is better than the general sequence below.
+  if (IsZeroExtended(OpA) && IsZeroExtended(OpB)) {
+    SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
+    if (IsCeil)
+      Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
+    return DAG.getNode(ISD::SRL, dl, VT, Add, ConstantOne);
+  }
+
+  if (IsSignExtended(OpA) && IsSignExtended(OpB)) {
+    SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
+    if (IsCeil)
+      Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
+    return DAG.getNode(ISD::SRA, dl, VT, Add, ConstantOne);
+  }
+
+  // General case: halve each operand first, then add back the carry from
+  // the discarded low bits: (OpA & OpB) & 1 for the floor forms and
+  // (OpA | OpB) & 1 for the rounding (ceil) forms.
+  SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
+  SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
+
+  SDValue Tmp;
+  if (IsCeil)
+    Tmp = DAG.getNode(ISD::OR, dl, VT, OpA, OpB);
+  else
+    Tmp = DAG.getNode(ISD::AND, dl, VT, OpA, OpB);
+
+  Tmp = DAG.getNode(ISD::AND, dl, VT, Tmp, ConstantOne);
+  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
+  return DAG.getNode(ISD::ADD, dl, VT, Add, Tmp);
+}
+
 SDValue
 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                SelectionDAG &DAG) const {
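
For reference, the identities behind the general-case sequence can be checked exhaustively on a narrow element type. A minimal standalone sketch (plain C++, illustrative only and not part of the patch; the signed forms are analogous with arithmetic shifts):

// avg_identities.cpp -- exhaustive check of the halving-add identities:
//   floor((a + b) / 2) == (a >> 1) + (b >> 1) + (a & b & 1)
//   ceil((a + b) / 2)  == (a >> 1) + (b >> 1) + ((a | b) & 1)
// The right-hand sides stay within the 8-bit element width; the reference
// values are computed in a wider type so they cannot overflow.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t a = 0; a <= 0xFF; ++a) {
    for (uint32_t b = 0; b <= 0xFF; ++b) {
      uint8_t Floor = (uint8_t)((a >> 1) + (b >> 1) + (a & b & 1));
      uint8_t Ceil = (uint8_t)((a >> 1) + (b >> 1) + ((a | b) & 1));
      assert(Floor == (a + b) / 2);    // AVGFLOORU
      assert(Ceil == (a + b + 1) / 2); // AVGCEILU
    }
  }
  return 0;
}
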
diff --git a/llvm/test/CodeGen/AArch64/sve-avg-floor-ceil.ll b/llvm/test/CodeGen/AArch64/sve-avg-floor-ceil.ll
--- a/llvm/test/CodeGen/AArch64/sve-avg-floor-ceil.ll
+++ b/llvm/test/CodeGen/AArch64/sve-avg-floor-ceil.ll
@@ -3,6 +3,55 @@
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve2 | FileCheck %s -check-prefixes=CHECK,SVE2
 ;
+define <vscale x 2 x i64> @hadds_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; SVE-LABEL: hadds_v2i64:
+; SVE:       // %bb.0: // %entry
+; SVE-NEXT:    asr z2.d, z1.d, #1
+; SVE-NEXT:    asr z3.d, z0.d, #1
+; SVE-NEXT:    and z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.d, z3.d, z2.d
+; SVE-NEXT:    and z0.d, z0.d, #0x1
+; SVE-NEXT:    add z0.d, z1.d, z0.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: hadds_v2i64:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    ptrue p0.d
+; SVE2-NEXT:    shadd z0.d, p0/m, z0.d, z1.d
+; SVE2-NEXT:    ret
+entry:
+  %s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+  %s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+  %m = add nsw <vscale x 2 x i128> %s0s, %s1s
+  %s = ashr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+  %s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %s2
+}
+
+define <vscale x 2 x i64> @haddu_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; SVE-LABEL: haddu_v2i64:
+; SVE:       // %bb.0: // %entry
+; SVE-NEXT:    lsr z2.d, z1.d, #1
+; SVE-NEXT:    lsr z3.d, z0.d, #1
+; SVE-NEXT:    and z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.d, z3.d, z2.d
+; SVE-NEXT:    and z0.d, z0.d, #0x1
+; SVE-NEXT:    add z0.d, z1.d, z0.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: haddu_v2i64:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    ptrue p0.d
+; SVE2-NEXT:    uhadd z0.d, p0/m, z0.d, z1.d
+; SVE2-NEXT:    ret
+entry:
+  %s0s = zext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+  %s1s = zext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+  %m = add nuw nsw <vscale x 2 x i128> %s0s, %s1s
+  %s = lshr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+  %s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %s2
+}
+
 define <vscale x 2 x i32> @hadds_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i32> %s1) {
 ; SVE-LABEL: hadds_v2i32:
@@ -56,15 +105,12 @@
 define <vscale x 4 x i32> @hadds_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
 ; SVE-LABEL: hadds_v4i32:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    sunpkhi z2.d, z0.s
-; SVE-NEXT:    sunpklo z0.d, z0.s
-; SVE-NEXT:    sunpkhi z3.d, z1.s
-; SVE-NEXT:    sunpklo z1.d, z1.s
-; SVE-NEXT:    add z0.d, z0.d, z1.d
-; SVE-NEXT:    add z1.d, z2.d, z3.d
-; SVE-NEXT:    lsr z1.d, z1.d, #1
-; SVE-NEXT:    lsr z0.d, z0.d, #1
-; SVE-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SVE-NEXT:    asr z2.s, z1.s, #1
+; SVE-NEXT:    asr z3.s, z0.s, #1
+; SVE-NEXT:    and z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.s, z3.s, z2.s
+; SVE-NEXT:    and z0.s, z0.s, #0x1
+; SVE-NEXT:    add z0.s, z1.s, z0.s
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: hadds_v4i32:
@@ -84,15 +130,12 @@
 define <vscale x 4 x i32> @haddu_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
 ; SVE-LABEL: haddu_v4i32:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    uunpkhi z2.d, z0.s
-; SVE-NEXT:    uunpklo z0.d, z0.s
-; SVE-NEXT:    uunpkhi z3.d, z1.s
-; SVE-NEXT:    uunpklo z1.d, z1.s
-; SVE-NEXT:    add z0.d, z0.d, z1.d
-; SVE-NEXT:    add z1.d, z2.d, z3.d
-; SVE-NEXT:    lsr z1.d, z1.d, #1
-; SVE-NEXT:    lsr z0.d, z0.d, #1
-; SVE-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SVE-NEXT:    lsr z2.s, z1.s, #1
+; SVE-NEXT:    lsr z3.s, z0.s, #1
+; SVE-NEXT:    and z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.s, z3.s, z2.s
+; SVE-NEXT:    and z0.s, z0.s, #0x1
+; SVE-NEXT:    add z0.s, z1.s, z0.s
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: haddu_v4i32:
@@ -214,15 +257,12 @@
 define <vscale x 8 x i16> @hadds_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
 ; SVE-LABEL: hadds_v8i16:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    sunpkhi z2.s, z0.h
-; SVE-NEXT:    sunpklo z0.s, z0.h
-; SVE-NEXT:    sunpkhi z3.s, z1.h
-; SVE-NEXT:    sunpklo z1.s, z1.h
-; SVE-NEXT:    add z0.s, z0.s, z1.s
-; SVE-NEXT:    add z1.s, z2.s, z3.s
-; SVE-NEXT:    lsr z1.s, z1.s, #1
-; SVE-NEXT:    lsr z0.s, z0.s, #1
-; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT:    asr z2.h, z1.h, #1
+; SVE-NEXT:    asr z3.h, z0.h, #1
+; SVE-NEXT:    and z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.h, z3.h, z2.h
+; SVE-NEXT:    and z0.h, z0.h, #0x1
+; SVE-NEXT:    add z0.h, z1.h, z0.h
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: hadds_v8i16:
@@ -242,15 +282,12 @@
 define <vscale x 8 x i16> @haddu_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
 ; SVE-LABEL: haddu_v8i16:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    uunpkhi z2.s, z0.h
-; SVE-NEXT:    uunpklo z0.s, z0.h
-; SVE-NEXT:    uunpkhi z3.s, z1.h
-; SVE-NEXT:    uunpklo z1.s, z1.h
-; SVE-NEXT:    add z0.s, z0.s, z1.s
-; SVE-NEXT:    add z1.s, z2.s, z3.s
-; SVE-NEXT:    lsr z1.s, z1.s, #1
-; SVE-NEXT:    lsr z0.s, z0.s, #1
-; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT:    lsr z2.h, z1.h, #1
+; SVE-NEXT:    lsr z3.h, z0.h, #1
+; SVE-NEXT:    and z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.h, z3.h, z2.h
+; SVE-NEXT:    and z0.h, z0.h, #0x1
+; SVE-NEXT:    add z0.h, z1.h, z0.h
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: haddu_v8i16:
@@ -372,15 +409,12 @@
 define <vscale x 16 x i8> @hadds_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
 ; SVE-LABEL: hadds_v16i8:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    sunpkhi z2.h, z0.b
-; SVE-NEXT:    sunpklo z0.h, z0.b
-; SVE-NEXT:    sunpkhi z3.h, z1.b
-; SVE-NEXT:    sunpklo z1.h, z1.b
-; SVE-NEXT:    add z0.h, z0.h, z1.h
-; SVE-NEXT:    add z1.h, z2.h, z3.h
-; SVE-NEXT:    lsr z1.h, z1.h, #1
-; SVE-NEXT:    lsr z0.h, z0.h, #1
-; SVE-NEXT:    uzp1 z0.b, z0.b, z1.b
+; SVE-NEXT:    asr z2.b, z1.b, #1
+; SVE-NEXT:    asr z3.b, z0.b, #1
+; SVE-NEXT:    and z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.b, z3.b, z2.b
+; SVE-NEXT:    and z0.b, z0.b, #0x1
+; SVE-NEXT:    add z0.b, z1.b, z0.b
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: hadds_v16i8:
@@ -400,15 +434,12 @@
 define <vscale x 16 x i8> @haddu_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
 ; SVE-LABEL: haddu_v16i8:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    uunpkhi z2.h, z0.b
-; SVE-NEXT:    uunpklo z0.h, z0.b
-; SVE-NEXT:    uunpkhi z3.h, z1.b
-; SVE-NEXT:    uunpklo z1.h, z1.b
-; SVE-NEXT:    add z0.h, z0.h, z1.h
-; SVE-NEXT:    add z1.h, z2.h, z3.h
-; SVE-NEXT:    lsr z1.h, z1.h, #1
-; SVE-NEXT:    lsr z0.h, z0.h, #1
-; SVE-NEXT:    uzp1 z0.b, z0.b, z1.b
+; SVE-NEXT:    lsr z2.b, z1.b, #1
+; SVE-NEXT:    lsr z3.b, z0.b, #1
+; SVE-NEXT:    and z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.b, z3.b, z2.b
+; SVE-NEXT:    and z0.b, z0.b, #0x1
+; SVE-NEXT:    add z0.b, z1.b, z0.b
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: haddu_v16i8:
@@ -477,18 +508,12 @@
 define <vscale x 4 x i32> @rhadds_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
 ; SVE-LABEL: rhadds_v4i32:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
-; SVE-NEXT:    sunpkhi z3.d, z0.s
-; SVE-NEXT:    sunpklo z0.d, z0.s
-; SVE-NEXT:    sunpkhi z4.d, z1.s
-; SVE-NEXT:    sunpklo z1.d, z1.s
-; SVE-NEXT:    eor z0.d, z0.d, z2.d
-; SVE-NEXT:    eor z2.d, z3.d, z2.d
-; SVE-NEXT:    sub z0.d, z1.d, z0.d
-; SVE-NEXT:    sub z1.d, z4.d, z2.d
-; SVE-NEXT:    lsr z0.d, z0.d, #1
-; SVE-NEXT:    lsr z1.d, z1.d, #1
-; SVE-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SVE-NEXT:    asr z2.s, z1.s, #1
+; SVE-NEXT:    asr z3.s, z0.s, #1
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.s, z3.s, z2.s
+; SVE-NEXT:    and z0.s, z0.s, #0x1
+; SVE-NEXT:    add z0.s, z1.s, z0.s
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: rhadds_v4i32:
@@ -509,18 +534,12 @@
 define <vscale x 4 x i32> @rhaddu_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
 ; SVE-LABEL: rhaddu_v4i32:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
-; SVE-NEXT:    uunpkhi z3.d, z0.s
-; SVE-NEXT:    uunpklo z0.d, z0.s
-; SVE-NEXT:    uunpkhi z4.d, z1.s
-; SVE-NEXT:    uunpklo z1.d, z1.s
-; SVE-NEXT:    eor z0.d, z0.d, z2.d
-; SVE-NEXT:    eor z2.d, z3.d, z2.d
-; SVE-NEXT:    sub z0.d, z1.d, z0.d
-; SVE-NEXT:    sub z1.d, z4.d, z2.d
-; SVE-NEXT:    lsr z0.d, z0.d, #1
-; SVE-NEXT:    lsr z1.d, z1.d, #1
-; SVE-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SVE-NEXT:    lsr z2.s, z1.s, #1
+; SVE-NEXT:    lsr z3.s, z0.s, #1
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.s, z3.s, z2.s
+; SVE-NEXT:    and z0.s, z0.s, #0x1
+; SVE-NEXT:    add z0.s, z1.s, z0.s
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: rhaddu_v4i32:
@@ -631,18 +650,12 @@
 define <vscale x 8 x i16> @rhadds_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
 ; SVE-LABEL: rhadds_v8i16:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    mov z2.s, #-1 // =0xffffffff
-; SVE-NEXT:    sunpkhi z3.s, z0.h
-; SVE-NEXT:    sunpklo z0.s, z0.h
-; SVE-NEXT:    sunpkhi z4.s, z1.h
-; SVE-NEXT:    sunpklo z1.s, z1.h
-; SVE-NEXT:    eor z0.d, z0.d, z2.d
-; SVE-NEXT:    eor z2.d, z3.d, z2.d
-; SVE-NEXT:    sub z0.s, z1.s, z0.s
-; SVE-NEXT:    sub z1.s, z4.s, z2.s
-; SVE-NEXT:    lsr z0.s, z0.s, #1
-; SVE-NEXT:    lsr z1.s, z1.s, #1
-; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT:    asr z2.h, z1.h, #1
+; SVE-NEXT:    asr z3.h, z0.h, #1
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.h, z3.h, z2.h
+; SVE-NEXT:    and z0.h, z0.h, #0x1
+; SVE-NEXT:    add z0.h, z1.h, z0.h
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: rhadds_v8i16:
@@ -663,18 +676,12 @@
 define <vscale x 8 x i16> @rhaddu_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
 ; SVE-LABEL: rhaddu_v8i16:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    mov z2.s, #-1 // =0xffffffff
-; SVE-NEXT:    uunpkhi z3.s, z0.h
-; SVE-NEXT:    uunpklo z0.s, z0.h
-; SVE-NEXT:    uunpkhi z4.s, z1.h
-; SVE-NEXT:    uunpklo z1.s, z1.h
-; SVE-NEXT:    eor z0.d, z0.d, z2.d
-; SVE-NEXT:    eor z2.d, z3.d, z2.d
-; SVE-NEXT:    sub z0.s, z1.s, z0.s
-; SVE-NEXT:    sub z1.s, z4.s, z2.s
-; SVE-NEXT:    lsr z0.s, z0.s, #1
-; SVE-NEXT:    lsr z1.s, z1.s, #1
-; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT:    lsr z2.h, z1.h, #1
+; SVE-NEXT:    lsr z3.h, z0.h, #1
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.h, z3.h, z2.h
+; SVE-NEXT:    and z0.h, z0.h, #0x1
+; SVE-NEXT:    add z0.h, z1.h, z0.h
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: rhaddu_v8i16:
@@ -785,18 +792,12 @@
 define <vscale x 16 x i8> @rhadds_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
 ; SVE-LABEL: rhadds_v16i8:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    mov z2.h, #-1 // =0xffff
-; SVE-NEXT:    sunpkhi z3.h, z0.b
-; SVE-NEXT:    sunpklo z0.h, z0.b
-; SVE-NEXT:    sunpkhi z4.h, z1.b
-; SVE-NEXT:    sunpklo z1.h, z1.b
-; SVE-NEXT:    eor z0.d, z0.d, z2.d
-; SVE-NEXT:    eor z2.d, z3.d, z2.d
-; SVE-NEXT:    sub z0.h, z1.h, z0.h
-; SVE-NEXT:    sub z1.h, z4.h, z2.h
-; SVE-NEXT:    lsr z0.h, z0.h, #1
-; SVE-NEXT:    lsr z1.h, z1.h, #1
-; SVE-NEXT:    uzp1 z0.b, z0.b, z1.b
+; SVE-NEXT:    asr z2.b, z1.b, #1
+; SVE-NEXT:    asr z3.b, z0.b, #1
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.b, z3.b, z2.b
+; SVE-NEXT:    and z0.b, z0.b, #0x1
+; SVE-NEXT:    add z0.b, z1.b, z0.b
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: rhadds_v16i8:
@@ -817,18 +818,12 @@
 define <vscale x 16 x i8> @rhaddu_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
 ; SVE-LABEL: rhaddu_v16i8:
 ; SVE:       // %bb.0: // %entry
-; SVE-NEXT:    mov z2.h, #-1 // =0xffff
-; SVE-NEXT:    uunpkhi z3.h, z0.b
-; SVE-NEXT:    uunpklo z0.h, z0.b
-; SVE-NEXT:    uunpkhi z4.h, z1.b
-; SVE-NEXT:    uunpklo z1.h, z1.b
-; SVE-NEXT:    eor z0.d, z0.d, z2.d
-; SVE-NEXT:    eor z2.d, z3.d, z2.d
-; SVE-NEXT:    sub z0.h, z1.h, z0.h
-; SVE-NEXT:    sub z1.h, z4.h, z2.h
-; SVE-NEXT:    lsr z0.h, z0.h, #1
-; SVE-NEXT:    lsr z1.h, z1.h, #1
-; SVE-NEXT:    uzp1 z0.b, z0.b, z1.b
+; SVE-NEXT:    lsr z2.b, z1.b, #1
+; SVE-NEXT:    lsr z3.b, z0.b, #1
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.b, z3.b, z2.b
+; SVE-NEXT:    and z0.b, z0.b, #0x1
+; SVE-NEXT:    add z0.b, z1.b, z0.b
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: rhaddu_v16i8:
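
The fast path taken when IsZeroExtended/IsSignExtended holds for both operands rests on a headroom argument: operands extended from a narrower type leave at least one spare bit in each element, so A + B (and A + B + 1 for the rounding forms) cannot wrap, and a plain add + shift is exact. A standalone sketch of that claim (plain C++, illustrative only and not part of the patch):

// avg_headroom.cpp -- check that with both operands zero-extended from
// 8 bits into 16-bit lanes, the sum (plus one for the rounding form)
// still fits in 16 bits, so add + logical shift is the exact average.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t a = 0; a <= 0xFF; ++a) {
    for (uint32_t b = 0; b <= 0xFF; ++b) {
      uint16_t Floor = (uint16_t)(a + b) >> 1;    // no wrap: a + b <= 0x1FE
      uint16_t Ceil = (uint16_t)(a + b + 1) >> 1; // no wrap: sum <= 0x1FF
      assert(Floor == (a + b) / 2);
      assert(Ceil == (a + b + 1) / 2);
    }
  }
  return 0;
}
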