Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12599,87 +12599,6 @@ return SDValue(); } -// Attempt to form one of the avg pagtterns from: -// truncate(shr(add(zext(OpB), zext(OpA)), 1)) -// Creating avgflooru/avgfloors/avgceilu/avgceils, with the ceiling having an -// extra rounding add: -// truncate(shr(add(zext(OpB), zext(OpA), 1), 1)) -// This starts at a truncate, meaning the shift will always be shl, as the top -// bits are known to not be demanded. -static SDValue performAvgCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - SDValue Shift = N->getOperand(0); - if (Shift.getOpcode() != ISD::SRL) - return SDValue(); - - // Is the right shift using an immediate value of 1? - ConstantSDNode *N1C = isConstOrConstSplat(Shift.getOperand(1)); - if (!N1C || !N1C->isOne()) - return SDValue(); - - // We are looking for an avgfloor - // add(ext, ext) - // or one of these as a avgceil - // add(add(ext, ext), 1) - // add(add(ext, 1), ext) - // add(ext, add(ext, 1)) - SDValue Add = Shift.getOperand(0); - if (Add.getOpcode() != ISD::ADD) - return SDValue(); - - SDValue ExtendOpA = Add.getOperand(0); - SDValue ExtendOpB = Add.getOperand(1); - auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3) { - ConstantSDNode *ConstOp; - if ((ConstOp = isConstOrConstSplat(Op1)) && ConstOp->isOne()) { - ExtendOpA = Op2; - ExtendOpB = Op3; - return true; - } - if ((ConstOp = isConstOrConstSplat(Op2)) && ConstOp->isOne()) { - ExtendOpA = Op1; - ExtendOpB = Op3; - return true; - } - if ((ConstOp = isConstOrConstSplat(Op3)) && ConstOp->isOne()) { - ExtendOpA = Op1; - ExtendOpB = Op2; - return true; - } - return false; - }; - bool IsCeil = (ExtendOpA.getOpcode() == ISD::ADD && - MatchOperands(ExtendOpA.getOperand(0), ExtendOpA.getOperand(1), - ExtendOpB)) || - (ExtendOpB.getOpcode() == 
ISD::ADD &&
-                 MatchOperands(ExtendOpB.getOperand(0), ExtendOpB.getOperand(1),
-                               ExtendOpA));
-
-  unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
-  unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
-  if (!(ExtendOpAOpc == ExtendOpBOpc &&
-        (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
-    return SDValue();
-
-  // Is the result of the right shift being truncated to the same value type as
-  // the original operands, OpA and OpB?
-  SDValue OpA = ExtendOpA.getOperand(0);
-  SDValue OpB = ExtendOpB.getOperand(0);
-  EVT OpAVT = OpA.getValueType();
-  assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
-  if (VT != OpAVT || OpAVT != OpB.getValueType())
-    return SDValue();
-
-  bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
-  unsigned AVGOpc = IsSignExtend ? (IsCeil ? ISD::AVGCEILS : ISD::AVGFLOORS)
-                                 : (IsCeil ? ISD::AVGCEILU : ISD::AVGFLOORU);
-  if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(AVGOpc, VT))
-    return SDValue();
-
-  return DAG.getNode(AVGOpc, SDLoc(N), VT, OpA, OpB);
-}
-
 SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -12966,8 +12885,6 @@
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
-  if (SDValue M = performAvgCombine(N, DAG))
-    return M;
 
   // Narrow a suitable binary operation with a non-opaque constant operand by
   // moving it ahead of the truncate. This is limited to pre-legalization
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -905,6 +905,153 @@
                                 Depth);
 }
 
+/// Try to replace a shift-right-by-one of an add with a narrower AVG node.
+///
+/// Forms ext(avgfloor(A, B)) from shr(add(ext(A), ext(B)), 1),
+///    or ext(avgceil(A, B)) from shr(add(ext(A), ext(B), 1), 1).
+/// The operands need not be explicit extends: the known leading sign/zero bits
+/// of the add operands decide how far the average can be narrowed.
+///
+/// \param Op           The ISD::SRL or ISD::SRA node to match.
+/// \param DAG          Used for value tracking and to build the new nodes.
+/// \param TLI          Queried for whether the AVG opcode is legal or custom.
+/// \param DemandedBits Bits of the result of \p Op that are demanded.
+/// \param DemandedElts Vector elements of \p Op that are demanded.
+/// \param Depth        Recursion depth handed to the value-tracking queries.
+/// \returns The replacement value, or an empty SDValue if nothing matched.
+static SDValue combineShiftToAVG(SDValue Op, SelectionDAG &DAG,
+                                 const TargetLowering &TLI,
+                                 const APInt &DemandedBits,
+                                 const APInt &DemandedElts,
+                                 unsigned Depth) {
+  assert((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) &&
+         "SRL or SRA node is required here!");
+  // Is the right shift using an immediate value of 1?
+  ConstantSDNode *N1C = isConstOrConstSplat(Op.getOperand(1), DemandedElts);
+  if (!N1C || !N1C->isOne())
+    return SDValue();
+
+  // We are looking for an avgfloor
+  //   add(ext, ext)
+  // or one of these as an avgceil
+  //   add(add(ext, ext), 1)
+  //   add(add(ext, 1), ext)
+  //   add(ext, add(ext, 1))
+  SDValue Add = Op.getOperand(0);
+  if (Add.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue ExtOpA = Add.getOperand(0);
+  SDValue ExtOpB = Add.getOperand(1);
+  // Given the three addends of a nested add, find the one that is a constant 1
+  // and rebind ExtOpA/ExtOpB to the other two. Returns false if none is 1.
+  auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3) {
+    ConstantSDNode *ConstOp;
+    if ((ConstOp = isConstOrConstSplat(Op1, DemandedElts)) &&
+        ConstOp->isOne()) {
+      ExtOpA = Op2;
+      ExtOpB = Op3;
+      return true;
+    }
+    if ((ConstOp = isConstOrConstSplat(Op2, DemandedElts)) &&
+        ConstOp->isOne()) {
+      ExtOpA = Op1;
+      ExtOpB = Op3;
+      return true;
+    }
+    if ((ConstOp = isConstOrConstSplat(Op3, DemandedElts)) &&
+        ConstOp->isOne()) {
+      ExtOpA = Op1;
+      ExtOpB = Op2;
+      return true;
+    }
+    return false;
+  };
+  bool IsCeil =
+      (ExtOpA.getOpcode() == ISD::ADD &&
+       MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB)) ||
+      (ExtOpB.getOpcode() == ISD::ADD &&
+       MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA));
+
+  // If the shift is signed (sra):
+  //  - Needs >= 2 sign bit for both operands.
+  //  - Needs >= 2 zero bits.
+  // If the shift is unsigned (srl):
+  //  - Needs >= 1 zero bit for both operands.
+  //  - Needs 1 demanded bit zero and >= 2 sign bits.
+  unsigned ShiftOpc = Op.getOpcode();
+  bool IsSigned = false;
+  unsigned KnownBits = 0;
+  unsigned NumSignedA = DAG.ComputeNumSignBits(ExtOpA, DemandedElts, Depth);
+  unsigned NumSignedB = DAG.ComputeNumSignBits(ExtOpB, DemandedElts, Depth);
+  // One less than the common sign-bit count, so ">= 2" reads NumSigned >= 1.
+  unsigned NumSigned = std::min(NumSignedA, NumSignedB) - 1;
+  unsigned NumZeroA =
+      DAG.computeKnownBits(ExtOpA, DemandedElts, Depth).countMinLeadingZeros();
+  unsigned NumZeroB =
+      DAG.computeKnownBits(ExtOpB, DemandedElts, Depth).countMinLeadingZeros();
+  unsigned NumZero = std::min(NumZeroA, NumZeroB);
+
+  // Use the unsigned form only when strictly more leading zeros than redundant
+  // sign bits are known; otherwise try the signed form.
+  switch (ShiftOpc) {
+  default:
+    llvm_unreachable("Unexpected ShiftOpc in combineShiftToAVG");
+  case ISD::SRA: {
+    if (NumZero >= 2 && NumSigned < NumZero) {
+      IsSigned = false;
+      KnownBits = NumZero;
+      break;
+    }
+    if (NumSigned >= 1) {
+      IsSigned = true;
+      KnownBits = NumSigned;
+      break;
+    }
+    return SDValue();
+  }
+  case ISD::SRL: {
+    if (NumZero >= 1 && NumSigned < NumZero) {
+      IsSigned = false;
+      KnownBits = NumZero;
+      break;
+    }
+    if (NumSigned >= 1 && DemandedBits.isSignBitClear()) {
+      IsSigned = true;
+      KnownBits = NumSigned;
+      break;
+    }
+    return SDValue();
+  }
+  }
+
+  unsigned AVGOpc = IsCeil ? (IsSigned ? ISD::AVGCEILS : ISD::AVGCEILU)
+                           : (IsSigned ? ISD::AVGFLOORS : ISD::AVGFLOORU);
+
+  // Find the smallest power-2 type that is legal for this vector size and
+  // operation, given the original type size and the number of known sign/zero
+  // bits.
+  EVT VT = Op.getValueType();
+  unsigned MinWidth =
+      std::max<unsigned>(VT.getScalarSizeInBits() - KnownBits, 8);
+  EVT NVT = EVT::getIntegerVT(*DAG.getContext(), PowerOf2Ceil(MinWidth));
+  // A VT narrower than 8 bits, or of non-power-of-2 width, can round up to
+  // something wider than VT itself; TRUNCATE cannot widen, so give up.
+  if (NVT.getScalarSizeInBits() > VT.getScalarSizeInBits())
+    return SDValue();
+  if (VT.isVector())
+    NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
+  if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT))
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue ResultAVG =
+      DAG.getNode(AVGOpc, DL, NVT, DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpA),
+                  DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpB));
+  return DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT,
+                     ResultAVG);
+}
+
 /// Look at Op. At this point, we know that only the OriginalDemandedBits of the
 /// result of Op are ever used downstream. If we can use this information to
 /// simplify Op, create a new simplified DAG node and return true, returning the
@@ -1567,6 +1714,11 @@
     SDValue Op1 = Op.getOperand(1);
     EVT ShiftVT = Op1.getValueType();
 
+    // Try to match AVG patterns.
+    if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+                                        DemandedElts, Depth + 1))
+      return TLO.CombineTo(Op, AVG);
+
     if (const APInt *SA =
             TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
       unsigned ShAmt = SA->getZExtValue();
@@ -1633,6 +1785,11 @@
     if (DemandedBits.isOne())
       return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
 
+    // Try to match AVG patterns.
+    if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+                                        DemandedElts, Depth + 1))
+      return TLO.CombineTo(Op, AVG);
+
     if (const APInt *SA =
             TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
       unsigned ShAmt = SA->getZExtValue();
Index: llvm/test/CodeGen/AArch64/arm64-vhadd.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -705,8 +705,8 @@
 define <4 x i32> @hadd16_sext_asr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
 ; CHECK-LABEL: hadd16_sext_asr:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: saddl.4s v0, v0, v1
-; CHECK-NEXT: sshr.4s v0, v0, #1
+; CHECK-NEXT: shadd.4h v0, v0, v1
+; CHECK-NEXT: sshll.4s v0, v0, #0
 ; CHECK-NEXT: ret
   %zextsrc1
= zext <4 x i16> %src1 to <4 x i32> %zextsrc2 = zext <4 x i16> %src2 to <4 x i32> @@ -744,8 +744,8 @@ define <4 x i32> @hadd16_zext_lsr(<4 x i16> %src1, <4 x i16> %src2) nounwind { ; CHECK-LABEL: hadd16_zext_lsr: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddl.4s v0, v0, v1 -; CHECK-NEXT: ushr.4s v0, v0, #1 +; CHECK-NEXT: uhadd.4h v0, v0, v1 +; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i16> %src1 to <4 x i32> %zextsrc2 = zext <4 x i16> %src2 to <4 x i32> @@ -759,10 +759,9 @@ define <4 x i64> @hadd32_sext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind { ; CHECK-LABEL: hadd32_sext_asr: ; CHECK: // %bb.0: -; CHECK-NEXT: saddl2.2d v2, v0, v1 -; CHECK-NEXT: saddl.2d v0, v0, v1 -; CHECK-NEXT: sshr.2d v1, v2, #1 -; CHECK-NEXT: sshr.2d v0, v0, #1 +; CHECK-NEXT: shadd.4s v0, v0, v1 +; CHECK-NEXT: sshll2.2d v1, v0, #0 +; CHECK-NEXT: sshll.2d v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i32> %src1 to <4 x i64> %zextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -774,10 +773,9 @@ define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind { ; CHECK-LABEL: hadd32_zext_asr: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddl2.2d v2, v0, v1 -; CHECK-NEXT: uaddl.2d v0, v0, v1 -; CHECK-NEXT: ushr.2d v1, v2, #1 -; CHECK-NEXT: ushr.2d v0, v0, #1 +; CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: ushll2.2d v1, v0, #0 +; CHECK-NEXT: ushll.2d v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> @@ -804,10 +802,9 @@ define <4 x i64> @hadd32_zext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind { ; CHECK-LABEL: hadd32_zext_lsr: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddl2.2d v2, v0, v1 -; CHECK-NEXT: uaddl.2d v0, v0, v1 -; CHECK-NEXT: ushr.2d v1, v2, #1 -; CHECK-NEXT: ushr.2d v0, v0, #1 +; CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: ushll2.2d v1, v0, #0 +; CHECK-NEXT: ushll.2d v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> 
@@ -884,9 +881,8 @@ define void @testLowerToSHADD8b_c(<8 x i8> %src1, <8 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD8b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: saddw.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v1, #10 +; CHECK-NEXT: shadd.8b v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <8 x i8> %src1 to <8 x i16> @@ -900,9 +896,8 @@ define void @testLowerToSHADD4h_c(<4 x i16> %src1, <4 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD4h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: saddw.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v0, v0, #1 +; CHECK-NEXT: movi.4h v1, #10 +; CHECK-NEXT: shadd.4h v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <4 x i16> %src1 to <4 x i32> @@ -916,10 +911,8 @@ define void @testLowerToSHADD2s_c(<2 x i32> %src1, <2 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD2s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: saddw.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v0, v0, #1 +; CHECK-NEXT: movi.2s v1, #10 +; CHECK-NEXT: shadd.2s v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <2 x i32> %src1 to <2 x i64> @@ -933,12 +926,9 @@ define void @testLowerToSHADD16b_c(<16 x i8> %src1, <16 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD16b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: saddw.8h v2, v1, v0 -; CHECK-NEXT: saddw2.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v1, v2, #1 -; CHECK-NEXT: shrn2.16b v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.16b v1, #10 +; CHECK-NEXT: shadd.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <16 x i8> %src1 to <16 x i16> %add = add <16 x i16> %sextsrc1, @@ -951,12 +941,9 @@ define void @testLowerToSHADD8h_c(<8 x i16> %src1, <8 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD8h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, 
#10 -; CHECK-NEXT: saddw.4s v2, v1, v0 -; CHECK-NEXT: saddw2.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v1, v2, #1 -; CHECK-NEXT: shrn2.8h v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.8h v1, #10 +; CHECK-NEXT: shadd.8h v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> %sextsrc1, @@ -969,13 +956,9 @@ define void @testLowerToSHADD4s_c(<4 x i32> %src1, <4 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD4s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: saddw.2d v2, v1, v0 -; CHECK-NEXT: saddw2.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v1, v2, #1 -; CHECK-NEXT: shrn2.4s v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.4s v1, #10 +; CHECK-NEXT: shadd.4s v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %add = add <4 x i64> %sextsrc1, @@ -988,9 +971,8 @@ define void @testLowerToUHADD8b_c(<8 x i8> %src1, <8 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD8b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: uaddw.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v1, #10 +; CHECK-NEXT: uhadd.8b v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i8> %src1 to <8 x i16> @@ -1004,9 +986,8 @@ define void @testLowerToUHADD4h_c(<4 x i16> %src1, <4 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD4h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: uaddw.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v0, v0, #1 +; CHECK-NEXT: movi.4h v1, #10 +; CHECK-NEXT: uhadd.4h v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i16> %src1 to <4 x i32> @@ -1020,10 +1001,8 @@ define void @testLowerToUHADD2s_c(<2 x i32> %src1, <2 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD2s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: uaddw.2d v0, 
v1, v0 -; CHECK-NEXT: shrn.2s v0, v0, #1 +; CHECK-NEXT: movi.2s v1, #10 +; CHECK-NEXT: uhadd.2s v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i32> %src1 to <2 x i64> @@ -1037,12 +1016,9 @@ define void @testLowerToUHADD16b_c(<16 x i8> %src1, <16 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD16b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: uaddw.8h v2, v1, v0 -; CHECK-NEXT: uaddw2.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v1, v2, #1 -; CHECK-NEXT: shrn2.16b v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.16b v1, #10 +; CHECK-NEXT: uhadd.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <16 x i8> %src1 to <16 x i16> %add = add <16 x i16> %zextsrc1, @@ -1055,12 +1031,9 @@ define void @testLowerToUHADD8h_c(<8 x i16> %src1, <8 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD8h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: uaddw.4s v2, v1, v0 -; CHECK-NEXT: uaddw2.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v1, v2, #1 -; CHECK-NEXT: shrn2.8h v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.8h v1, #10 +; CHECK-NEXT: uhadd.8h v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> %zextsrc1, @@ -1073,13 +1046,9 @@ define void @testLowerToUHADD4s_c(<4 x i32> %src1, <4 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD4s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: uaddw.2d v2, v1, v0 -; CHECK-NEXT: uaddw2.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v1, v2, #1 -; CHECK-NEXT: shrn2.4s v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.4s v1, #10 +; CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %add = add <4 x i64> %zextsrc1, @@ -1093,10 +1062,10 @@ define <8 x i8> @andmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: andmaskv8i8: 
; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v2, #7 -; CHECK-NEXT: and.16b v0, v0, v2 -; CHECK-NEXT: uaddw.8h v0, v0, v1 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v2, #7 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: uhadd.8b v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <8 x i16> %src1, %zextsrc2 = zext <8 x i8> %src2 to <8 x i16> @@ -1109,13 +1078,10 @@ define <16 x i8> @andmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) nounwind { ; CHECK-LABEL: andmaskv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v3, #7 +; CHECK-NEXT: movi.16b v3, #7 +; CHECK-NEXT: uzp1.16b v0, v0, v1 ; CHECK-NEXT: and.16b v0, v0, v3 -; CHECK-NEXT: and.16b v1, v1, v3 -; CHECK-NEXT: uaddw.8h v0, v0, v2 -; CHECK-NEXT: uaddw2.8h v1, v1, v2 -; CHECK-NEXT: shrn.8b v0, v0, #1 -; CHECK-NEXT: shrn2.16b v0, v1, #1 +; CHECK-NEXT: uhadd.16b v0, v0, v2 ; CHECK-NEXT: ret %zextsrc1 = and <16 x i16> %src1, %zextsrc2 = zext <16 x i8> %src2 to <16 x i16> @@ -1128,16 +1094,13 @@ define <16 x i8> @andmask2v16i8(<16 x i16> %src1, <16 x i16> %src2) nounwind { ; CHECK-LABEL: andmask2v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v4, #7 -; CHECK-NEXT: movi.8h v5, #3 -; CHECK-NEXT: and.16b v0, v0, v4 -; CHECK-NEXT: and.16b v2, v2, v5 -; CHECK-NEXT: and.16b v1, v1, v4 -; CHECK-NEXT: and.16b v3, v3, v5 -; CHECK-NEXT: add.8h v0, v0, v2 -; CHECK-NEXT: add.8h v1, v1, v3 -; CHECK-NEXT: shrn.8b v0, v0, #1 -; CHECK-NEXT: shrn2.16b v0, v1, #1 +; CHECK-NEXT: movi.16b v4, #3 +; CHECK-NEXT: movi.16b v5, #7 +; CHECK-NEXT: uzp1.16b v2, v2, v3 +; CHECK-NEXT: uzp1.16b v0, v0, v1 +; CHECK-NEXT: and.16b v1, v2, v4 +; CHECK-NEXT: and.16b v0, v0, v5 +; CHECK-NEXT: uhadd.16b v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <16 x i16> %src1, %zextsrc2 = and <16 x i16> %src2, @@ -1150,11 +1113,11 @@ define <8 x i8> @andmask2v8i8(<8 x i16> %src1, <8 x i16> %src2) nounwind { ; CHECK-LABEL: andmask2v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v2, #7 -; CHECK-NEXT: bic.8h v1, #255, lsl #8 -; CHECK-NEXT: 
and.16b v0, v0, v2 -; CHECK-NEXT: add.8h v0, v0, v1 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v2, #7 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: xtn.8b v1, v1 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: uhadd.8b v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <8 x i16> %src1, %zextsrc2 = and <8 x i16> %src2, @@ -1170,8 +1133,7 @@ ; CHECK-NEXT: movi.8h v2, #7 ; CHECK-NEXT: bic.8h v1, #254, lsl #8 ; CHECK-NEXT: and.16b v0, v0, v2 -; CHECK-NEXT: add.8h v0, v0, v1 -; CHECK-NEXT: ushr.8h v0, v0, #1 +; CHECK-NEXT: uhadd.8h v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <8 x i16> %src1, %zextsrc2 = and <8 x i16> %src2, @@ -1183,12 +1145,10 @@ define <16 x i8> @sextmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) nounwind { ; CHECK-LABEL: sextmaskv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll.8h v3, v2, #0 ; CHECK-NEXT: sshr.8h v1, v1, #11 -; CHECK-NEXT: ssra.8h v3, v0, #11 -; CHECK-NEXT: saddw2.8h v1, v1, v2 -; CHECK-NEXT: shrn.8b v0, v3, #1 -; CHECK-NEXT: shrn2.16b v0, v1, #1 +; CHECK-NEXT: sshr.8h v0, v0, #11 +; CHECK-NEXT: uzp1.16b v0, v0, v1 +; CHECK-NEXT: shadd.16b v0, v0, v2 ; CHECK-NEXT: ret %sextsrc1 = ashr <16 x i16> %src1, %sextsrc2 = sext <16 x i8> %src2 to <16 x i16> @@ -1201,9 +1161,9 @@ define <8 x i8> @sextmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: sextmaskv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll.8h v1, v1, #0 -; CHECK-NEXT: ssra.8h v1, v0, #11 -; CHECK-NEXT: shrn.8b v0, v1, #1 +; CHECK-NEXT: sshr.8h v0, v0, #11 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: shadd.8b v0, v0, v1 ; CHECK-NEXT: ret %sextsrc1 = ashr <8 x i16> %src1, %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> @@ -1216,9 +1176,8 @@ define <8 x i8> @sextmask2v8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: sextmask2v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll.8h v1, v1, #0 -; CHECK-NEXT: ssra.8h v1, v0, #8 -; CHECK-NEXT: shrn.8b v0, v1, #1 +; CHECK-NEXT: shrn.8b v0, v0, #8 +; CHECK-NEXT: shadd.8b v0, v0, v1 ; CHECK-NEXT: ret %sextsrc1 = ashr 
<8 x i16> %src1, %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> @@ -1231,9 +1190,10 @@ define <8 x i8> @sextmask3v8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: sextmask3v8i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sshr.8h v0, v0, #7 ; CHECK-NEXT: sshll.8h v1, v1, #0 -; CHECK-NEXT: usra.8h v1, v0, #7 -; CHECK-NEXT: shrn.8b v0, v1, #1 +; CHECK-NEXT: shadd.8h v0, v0, v1 +; CHECK-NEXT: xtn.8b v0, v0 ; CHECK-NEXT: ret %sextsrc1 = ashr <8 x i16> %src1, %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>