diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1030,6 +1030,18 @@ EVT NVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_ceil(MinWidth)); if (VT.isVector()) NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount()); + + if ((ExtOpA.getOpcode() == ISD::ZERO_EXTEND || + ExtOpA.getOpcode() == ISD::SIGN_EXTEND) && + (ExtOpA.getOperand(0).getOpcode() == ISD::TRUNCATE) && + (ExtOpB.getOpcode() == ISD::ZERO_EXTEND || + ExtOpB.getOpcode() == ISD::SIGN_EXTEND) && + (ExtOpB.getOperand(0).getOpcode() == ISD::TRUNCATE)) { + // check if the extended nodes were previously truncated + ExtOpA = ExtOpA.getOperand(0).getOperand(0); + ExtOpB = ExtOpB.getOperand(0).getOperand(0); + } + if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT)) { // If we could not transform, and (both) adds are nuw/nsw, we can use the // larger type size to do the transform. 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1087,6 +1087,7 @@ SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const; + SDValue LowerAVGFloor_AVGCeil(SDValue Node, SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1289,12 +1289,10 @@ setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); - if (Subtarget->hasSVE2()) { - setOperationAction(ISD::AVGFLOORS, VT, Custom); - setOperationAction(ISD::AVGFLOORU, VT, Custom); - setOperationAction(ISD::AVGCEILS, VT, Custom); - setOperationAction(ISD::AVGCEILU, VT, Custom); - } + setOperationAction(ISD::AVGFLOORS, VT, Custom); + setOperationAction(ISD::AVGFLOORU, VT, Custom); + setOperationAction(ISD::AVGCEILS, VT, Custom); + setOperationAction(ISD::AVGCEILU, VT, Custom); } // Illegal unpacked integer vector types. 
@@ -6089,13 +6087,21 @@ case ISD::ABDU: return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED); case ISD::AVGFLOORS: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED); + if (Subtarget->hasSVE2()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED); + return LowerAVGFloor_AVGCeil(Op, DAG); case ISD::AVGFLOORU: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED); + if (Subtarget->hasSVE2()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED); + return LowerAVGFloor_AVGCeil(Op, DAG); case ISD::AVGCEILS: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED); + if (Subtarget->hasSVE2()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED); + return LowerAVGFloor_AVGCeil(Op, DAG); case ISD::AVGCEILU: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED); + if (Subtarget->hasSVE2()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED); + return LowerAVGFloor_AVGCeil(Op, DAG); case ISD::BITREVERSE: return LowerBitreverse(Op, DAG); case ISD::BSWAP: @@ -13376,6 +13382,42 @@ return Chain; } +SDValue AArch64TargetLowering::LowerAVGFloor_AVGCeil(SDValue Node, + SelectionDAG &DAG) const { + SDLoc dl(Node); + SDValue OpA = Node->getOperand(0); + SDValue OpB = Node->getOperand(1); + EVT VT = Node->getValueType(0); + SDValue ConstantOne = DAG.getConstant(1, dl, VT); + + assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); + + SDValue srlA = DAG.getNode(ISD::SRL, dl, VT, OpA, ConstantOne); + SDValue srlB = DAG.getNode(ISD::SRL, dl, VT, OpB, ConstantOne); + + SDValue tmp; + ConstantSDNode *OpBC = dyn_cast<ConstantSDNode>(OpB); + if (Node->getOpcode() == ISD::AVGFLOORU || + Node->getOpcode() == ISD::AVGFLOORS) { + if (OpBC && OpBC->isOne()) + tmp = DAG.getNode(ISD::AND, dl, VT, OpA, ConstantOne); + else { + tmp = DAG.getNode(ISD::AND, dl, VT, OpA, OpB); + tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne); + } + } else { + if (OpBC && OpBC->isOne()) + tmp = OpB; + else { + tmp = 
DAG.getNode(ISD::OR, dl, VT, OpA, OpB); + tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne); + } + } + + SDValue Add = DAG.getNode(ISD::ADD, dl, VT, srlA, srlB); + return DAG.getNode(ISD::ADD, dl, VT, Add, tmp); +} + SDValue AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll --- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -810,10 +810,6 @@ define <4 x i16> @hadd8_sext_asr(<4 x i8> %src1, <4 x i8> %src2) { ; CHECK-LABEL: hadd8_sext_asr: ; CHECK: // %bb.0: -; CHECK-NEXT: shl.4h v0, v0, #8 -; CHECK-NEXT: shl.4h v1, v1, #8 -; CHECK-NEXT: sshr.4h v0, v0, #8 -; CHECK-NEXT: sshr.4h v1, v1, #8 ; CHECK-NEXT: shadd.4h v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i8> %src1 to <4 x i16> @@ -826,8 +822,6 @@ define <4 x i16> @hadd8_zext_asr(<4 x i8> %src1, <4 x i8> %src2) { ; CHECK-LABEL: hadd8_zext_asr: ; CHECK: // %bb.0: -; CHECK-NEXT: bic.4h v0, #255, lsl #8 -; CHECK-NEXT: bic.4h v1, #255, lsl #8 ; CHECK-NEXT: uhadd.4h v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i8> %src1 to <4 x i16> @@ -856,8 +850,6 @@ define <4 x i16> @hadd8_zext_lsr(<4 x i8> %src1, <4 x i8> %src2) { ; CHECK-LABEL: hadd8_zext_lsr: ; CHECK: // %bb.0: -; CHECK-NEXT: bic.4h v0, #255, lsl #8 -; CHECK-NEXT: bic.4h v1, #255, lsl #8 ; CHECK-NEXT: uhadd.4h v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i8> %src1 to <4 x i16> diff --git a/llvm/test/CodeGen/AArch64/sve-avg_floor_ceil.ll b/llvm/test/CodeGen/AArch64/sve-avg_floor_ceil.ll --- a/llvm/test/CodeGen/AArch64/sve-avg_floor_ceil.ll +++ b/llvm/test/CodeGen/AArch64/sve-avg_floor_ceil.ll @@ -1,6 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 | FileCheck %s -; + +define @hadds_v2i64( %s0, %s1) { +; CHECK-LABEL: hadds_v2i64: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v2i64( %s0, %s1) { +; CHECK-LABEL: haddu_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} define @hadds_v2i32( %s0, %s1) { ; CHECK-LABEL: hadds_v2i32: @@ -22,9 +59,12 @@ define @haddu_v2i32( %s0, %s1) { ; CHECK-LABEL: haddu_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: adr z0.d, [z0.d, z1.d, uxtw] -; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -38,15 +78,12 @@ define @hadds_v4i32( %s0, %s1) { ; CHECK-LABEL: hadds_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpkhi z2.d, z0.s -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: sunpkhi z3.d, z1.s -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: add z0.d, z0.d, z1.d -; CHECK-NEXT: add z1.d, z2.d, z3.d -; CHECK-NEXT: lsr z1.d, z1.d, #1 -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, 
z0.s, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: ret entry: %s0s = sext %s0 to @@ -60,15 +97,12 @@ define @haddu_v4i32( %s0, %s1) { ; CHECK-LABEL: haddu_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpkhi z3.d, z1.s -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: add z0.d, z0.d, z1.d -; CHECK-NEXT: add z1.d, z2.d, z3.d -; CHECK-NEXT: lsr z1.d, z1.d, #1 -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, z0.s, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -101,10 +135,14 @@ define @haddu_v2i16( %s0, %s1) { ; CHECK-LABEL: haddu_v2i16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z2.d, z0.d, z1.d ; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: and z1.d, z1.d, #0xffff -; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: lsr z1.d, z1.d, #1 ; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: add z0.d, z0.d, z2.d ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -136,10 +174,12 @@ define @haddu_v4i16( %s0, %s1) { ; CHECK-LABEL: haddu_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z0.s, z0.s, #0xffff -; CHECK-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, z0.s, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -153,15 +193,12 @@ define @hadds_v8i16( %s0, %s1) { ; CHECK-LABEL: hadds_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: 
sunpkhi z2.s, z0.h -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpkhi z3.s, z1.h -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: add z1.s, z2.s, z3.s -; CHECK-NEXT: lsr z1.s, z1.s, #1 -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: add z0.h, z2.h, z0.h ; CHECK-NEXT: ret entry: %s0s = sext %s0 to @@ -175,15 +212,12 @@ define @haddu_v8i16( %s0, %s1) { ; CHECK-LABEL: haddu_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z2.s, z0.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpkhi z3.s, z1.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: add z1.s, z2.s, z3.s -; CHECK-NEXT: lsr z1.s, z1.s, #1 -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: add z0.h, z2.h, z0.h ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -216,10 +250,14 @@ define @haddu_v4i8( %s0, %s1) { ; CHECK-LABEL: haddu_v4i8: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z2.d, z0.d, z1.d ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: and z1.s, z1.s, #0xff -; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: lsr z1.s, z1.s, #1 ; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -251,10 +289,12 @@ define @haddu_v8i8( %s0, %s1) { ; CHECK-LABEL: haddu_v8i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z0.h, z0.h, #0xff -; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: add z0.h, z0.h, z1.h -; CHECK-NEXT: lsr z0.h, z0.h, #1 +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; 
CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: add z0.h, z2.h, z0.h ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -268,15 +308,12 @@ define @hadds_v16i8( %s0, %s1) { ; CHECK-LABEL: hadds_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpkhi z2.h, z0.b -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpkhi z3.h, z1.b -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: add z0.h, z0.h, z1.h -; CHECK-NEXT: add z1.h, z2.h, z3.h -; CHECK-NEXT: lsr z1.h, z1.h, #1 -; CHECK-NEXT: lsr z0.h, z0.h, #1 -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: lsr z2.b, z1.b, #1 +; CHECK-NEXT: lsr z3.b, z0.b, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.b, z3.b, z2.b +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: add z0.b, z2.b, z0.b ; CHECK-NEXT: ret entry: %s0s = sext %s0 to @@ -290,15 +327,12 @@ define @haddu_v16i8( %s0, %s1) { ; CHECK-LABEL: haddu_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z2.h, z0.b -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpkhi z3.h, z1.b -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: add z0.h, z0.h, z1.h -; CHECK-NEXT: add z1.h, z2.h, z3.h -; CHECK-NEXT: lsr z1.h, z1.h, #1 -; CHECK-NEXT: lsr z0.h, z0.h, #1 -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: lsr z2.b, z1.b, #1 +; CHECK-NEXT: lsr z3.b, z0.b, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.b, z3.b, z2.b +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: add z0.b, z2.b, z0.b ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -309,6 +343,46 @@ ret %s2 } +define @rhadds_v2i64( %s0, %s1) { +; CHECK-LABEL: rhadds_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = 
add %s0s, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v2i64( %s0, %s1) { +; CHECK-LABEL: rhaddu_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + define @rhadds_v2i32( %s0, %s1) { ; CHECK-LABEL: rhadds_v2i32: ; CHECK: // %bb.0: // %entry @@ -333,12 +407,12 @@ define @rhaddu_v2i32( %s0, %s1) { ; CHECK-LABEL: rhaddu_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z1.d, z1.d, #0xffffffff -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.d, z1.d, z0.d -; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -353,18 +427,12 @@ define @rhadds_v4i32( %s0, %s1) { ; CHECK-LABEL: rhadds_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpkhi z2.d, z0.s -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: mov z4.d, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sunpkhi z3.d, z1.s -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: eor z0.d, z0.d, z4.d -; CHECK-NEXT: sub z0.d, z1.d, z0.d 
-; CHECK-NEXT: eor z1.d, z2.d, z4.d -; CHECK-NEXT: sub z1.d, z3.d, z1.d -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: lsr z1.d, z1.d, #1 -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, z0.s, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: ret entry: %s0s = sext %s0 to @@ -379,18 +447,12 @@ define @rhaddu_v4i32( %s0, %s1) { ; CHECK-LABEL: rhaddu_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: mov z4.d, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uunpkhi z3.d, z1.s -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: eor z0.d, z0.d, z4.d -; CHECK-NEXT: sub z0.d, z1.d, z0.d -; CHECK-NEXT: eor z1.d, z2.d, z4.d -; CHECK-NEXT: sub z1.d, z3.d, z1.d -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: lsr z1.d, z1.d, #1 -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, z0.s, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -468,12 +530,12 @@ define @rhaddu_v4i16( %s0, %s1) { ; CHECK-LABEL: rhaddu_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z0.s, z0.s, #0xffff -; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.s, z1.s, z0.s -; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, z0.s, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -488,18 +550,12 @@ define @rhadds_v8i16( %s0, %s1) { ; CHECK-LABEL: rhadds_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpkhi z2.s, z0.h -; 
CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: mov z4.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sunpkhi z3.s, z1.h -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: eor z0.d, z0.d, z4.d -; CHECK-NEXT: sub z0.s, z1.s, z0.s -; CHECK-NEXT: eor z1.d, z2.d, z4.d -; CHECK-NEXT: sub z1.s, z3.s, z1.s -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: lsr z1.s, z1.s, #1 -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: add z0.h, z2.h, z0.h ; CHECK-NEXT: ret entry: %s0s = sext %s0 to @@ -514,18 +570,12 @@ define @rhaddu_v8i16( %s0, %s1) { ; CHECK-LABEL: rhaddu_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z2.s, z0.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: mov z4.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uunpkhi z3.s, z1.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: eor z0.d, z0.d, z4.d -; CHECK-NEXT: sub z0.s, z1.s, z0.s -; CHECK-NEXT: eor z1.d, z2.d, z4.d -; CHECK-NEXT: sub z1.s, z3.s, z1.s -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: lsr z1.s, z1.s, #1 -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: add z0.h, z2.h, z0.h ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -603,12 +653,12 @@ define @rhaddu_v8i8( %s0, %s1) { ; CHECK-LABEL: rhaddu_v8i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z0.h, z0.h, #0xff -; CHECK-NEXT: mov z2.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.h, z1.h, z0.h -; CHECK-NEXT: lsr z0.h, z0.h, #1 +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: 
add z0.h, z2.h, z0.h ; CHECK-NEXT: ret entry: %s0s = zext %s0 to @@ -623,18 +673,12 @@ define @rhadds_v16i8( %s0, %s1) { ; CHECK-LABEL: rhadds_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpkhi z2.h, z0.b -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: mov z4.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sunpkhi z3.h, z1.b -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: eor z0.d, z0.d, z4.d -; CHECK-NEXT: sub z0.h, z1.h, z0.h -; CHECK-NEXT: eor z1.d, z2.d, z4.d -; CHECK-NEXT: sub z1.h, z3.h, z1.h -; CHECK-NEXT: lsr z0.h, z0.h, #1 -; CHECK-NEXT: lsr z1.h, z1.h, #1 -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: lsr z2.b, z1.b, #1 +; CHECK-NEXT: lsr z3.b, z0.b, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.b, z3.b, z2.b +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: add z0.b, z2.b, z0.b ; CHECK-NEXT: ret entry: %s0s = sext %s0 to @@ -649,18 +693,12 @@ define @rhaddu_v16i8( %s0, %s1) { ; CHECK-LABEL: rhaddu_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z2.h, z0.b -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: mov z4.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uunpkhi z3.h, z1.b -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: eor z0.d, z0.d, z4.d -; CHECK-NEXT: sub z0.h, z1.h, z0.h -; CHECK-NEXT: eor z1.d, z2.d, z4.d -; CHECK-NEXT: sub z1.h, z3.h, z1.h -; CHECK-NEXT: lsr z0.h, z0.h, #1 -; CHECK-NEXT: lsr z1.h, z1.h, #1 -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: lsr z2.b, z1.b, #1 +; CHECK-NEXT: lsr z3.b, z0.b, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.b, z3.b, z2.b +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: add z0.b, z2.b, z0.b ; CHECK-NEXT: ret entry: %s0s = zext %s0 to diff --git a/llvm/test/CodeGen/AArch64/sve2-hadd.ll b/llvm/test/CodeGen/AArch64/sve2-hadd.ll --- a/llvm/test/CodeGen/AArch64/sve2-hadd.ll +++ b/llvm/test/CodeGen/AArch64/sve2-hadd.ll @@ -52,8 +52,6 @@ ; CHECK-LABEL: haddu_v2i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: and z0.d, z0.d, 
#0xffffffff -; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: uhadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -153,8 +151,6 @@ ; CHECK-LABEL: haddu_v4i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: and z0.s, z0.s, #0xffff -; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: uhadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: @@ -254,8 +250,6 @@ ; CHECK-LABEL: haddu_v8i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: and z0.h, z0.h, #0xff -; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: uhadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret entry: @@ -354,8 +348,6 @@ ; CHECK-LABEL: rhaddu_v2i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: urhadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -467,8 +459,6 @@ ; CHECK-LABEL: rhaddu_v4i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: and z0.s, z0.s, #0xffff -; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: urhadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: @@ -580,8 +570,6 @@ ; CHECK-LABEL: rhaddu_v8i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: and z0.h, z0.h, #0xff -; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: urhadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll --- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll @@ -49,8 +49,6 @@ define arm_aapcs_vfpcc <4 x i16> @vhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) { ; CHECK-LABEL: vhaddu_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: vhadd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: @@ -147,8 +145,6 @@ define arm_aapcs_vfpcc <8 x i8> @vhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: vhaddu_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: 
vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vhadd.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: @@ -241,8 +237,6 @@ define arm_aapcs_vfpcc <4 x i16> @vrhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) { ; CHECK-LABEL: vrhaddu_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: vrhadd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: @@ -352,8 +346,6 @@ define arm_aapcs_vfpcc <8 x i8> @vrhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: vrhaddu_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vrhadd.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: