diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12734,87 +12734,6 @@
   return SDValue();
 }
 
-// Attempt to form one of the avg patterns from:
-//   truncate(shr(add(zext(OpB), zext(OpA)), 1))
-// Creating avgflooru/avgfloors/avgceilu/avgceils, with the ceiling having an
-// extra rounding add:
-//   truncate(shr(add(zext(OpB), zext(OpA), 1), 1))
-// This starts at a truncate, meaning the shift will always be shl, as the top
-// bits are known to not be demanded.
-static SDValue performAvgCombine(SDNode *N, SelectionDAG &DAG) {
-  assert(N->getOpcode() == ISD::TRUNCATE && "TRUNCATE node expected");
-  EVT VT = N->getValueType(0);
-
-  SDValue Shift = N->getOperand(0);
-  if (Shift.getOpcode() != ISD::SRL)
-    return SDValue();
-
-  // Is the right shift using an immediate value of 1?
-  ConstantSDNode *N1C = isConstOrConstSplat(Shift.getOperand(1));
-  if (!N1C || !N1C->isOne())
-    return SDValue();
-
-  // We are looking for an avgfloor
-  //   add(ext, ext)
-  // or one of these as a avgceil
-  //   add(add(ext, ext), 1)
-  //   add(add(ext, 1), ext)
-  //   add(ext, add(ext, 1))
-  SDValue Add = Shift.getOperand(0);
-  if (Add.getOpcode() != ISD::ADD)
-    return SDValue();
-
-  SDValue ExtendOpA = Add.getOperand(0);
-  SDValue ExtendOpB = Add.getOperand(1);
-  auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3) {
-    ConstantSDNode *ConstOp;
-    if ((ConstOp = isConstOrConstSplat(Op1)) && ConstOp->isOne()) {
-      ExtendOpA = Op2;
-      ExtendOpB = Op3;
-      return true;
-    }
-    if ((ConstOp = isConstOrConstSplat(Op2)) && ConstOp->isOne()) {
-      ExtendOpA = Op1;
-      ExtendOpB = Op3;
-      return true;
-    }
-    if ((ConstOp = isConstOrConstSplat(Op3)) && ConstOp->isOne()) {
-      ExtendOpA = Op1;
-      ExtendOpB = Op2;
-      return true;
-    }
-    return false;
-  };
-  bool IsCeil = (ExtendOpA.getOpcode() == ISD::ADD &&
-                 MatchOperands(ExtendOpA.getOperand(0), ExtendOpA.getOperand(1),
-                               ExtendOpB)) ||
-                (ExtendOpB.getOpcode() == ISD::ADD &&
-                 MatchOperands(ExtendOpB.getOperand(0), ExtendOpB.getOperand(1),
-                               ExtendOpA));
-
-  unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
-  unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
-  if (!(ExtendOpAOpc == ExtendOpBOpc &&
-        (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
-    return SDValue();
-
-  // Is the result of the right shift being truncated to the same value type as
-  // the original operands, OpA and OpB?
-  SDValue OpA = ExtendOpA.getOperand(0);
-  SDValue OpB = ExtendOpB.getOperand(0);
-  EVT OpAVT = OpA.getValueType();
-  if (VT != OpAVT || OpAVT != OpB.getValueType())
-    return SDValue();
-
-  bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
-  unsigned AVGOpc = IsSignExtend ? (IsCeil ? ISD::AVGCEILS : ISD::AVGFLOORS)
-                                 : (IsCeil ? ISD::AVGCEILU : ISD::AVGFLOORU);
-  if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(AVGOpc, VT))
-    return SDValue();
-
-  return DAG.getNode(AVGOpc, SDLoc(N), VT, OpA, OpB);
-}
-
 SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -13101,8 +13020,6 @@
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
-  if (SDValue M = performAvgCombine(N, DAG))
-    return M;
 
   // Narrow a suitable binary operation with a non-opaque constant operand by
   // moving it ahead of the truncate.
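As a reference for the patterns being moved, the avg combines recognise a fixed-width average that is widened to avoid overflow and then truncated back. A minimal scalar sketch of the unsigned forms (illustrative only, not code from this patch; the function names are made up):

#include <cstdint>

// Floor average: truncate(srl(add(zext(a), zext(b)), 1)).
// Corresponds to ISD::AVGFLOORU (AArch64 uhadd); the sign-extended variant
// corresponds to ISD::AVGFLOORS (AArch64 shadd).
static uint8_t avg_floor_u8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((uint16_t(a) + uint16_t(b)) >> 1);
}

// Ceiling average: the same pattern with an extra +1 before the shift.
// Corresponds to ISD::AVGCEILU (AArch64 urhadd, x86 pavgb/pavgw); the signed
// variant corresponds to ISD::AVGCEILS (AArch64 srhadd).
static uint8_t avg_ceil_u8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((uint16_t(a) + uint16_t(b) + 1) >> 1);
}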
This is limited to pre-legalization.

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -907,6 +907,132 @@
                               Depth);
 }
 
+// Attempt to form ext(avgfloor(A, B)) from shr(add(ext(A), ext(B)), 1).
+//      or to form ext(avgceil(A, B)) from shr(add(ext(A), ext(B), 1), 1).
+static SDValue combineShiftToAVG(SDValue Op, SelectionDAG &DAG,
+                                 const TargetLowering &TLI,
+                                 const APInt &DemandedBits,
+                                 const APInt &DemandedElts,
+                                 unsigned Depth) {
+  assert((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) &&
+         "SRL or SRA node is required here!");
+  // Is the right shift using an immediate value of 1?
+  ConstantSDNode *N1C = isConstOrConstSplat(Op.getOperand(1), DemandedElts);
+  if (!N1C || !N1C->isOne())
+    return SDValue();
+
+  // We are looking for an avgfloor
+  //   add(ext, ext)
+  // or one of these as a avgceil
+  //   add(add(ext, ext), 1)
+  //   add(add(ext, 1), ext)
+  //   add(ext, add(ext, 1))
+  SDValue Add = Op.getOperand(0);
+  if (Add.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue ExtOpA = Add.getOperand(0);
+  SDValue ExtOpB = Add.getOperand(1);
+  auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3) {
+    ConstantSDNode *ConstOp;
+    if ((ConstOp = isConstOrConstSplat(Op1, DemandedElts)) &&
+        ConstOp->isOne()) {
+      ExtOpA = Op2;
+      ExtOpB = Op3;
+      return true;
+    }
+    if ((ConstOp = isConstOrConstSplat(Op2, DemandedElts)) &&
+        ConstOp->isOne()) {
+      ExtOpA = Op1;
+      ExtOpB = Op3;
+      return true;
+    }
+    if ((ConstOp = isConstOrConstSplat(Op3, DemandedElts)) &&
+        ConstOp->isOne()) {
+      ExtOpA = Op1;
+      ExtOpB = Op2;
+      return true;
+    }
+    return false;
+  };
+  bool IsCeil =
+      (ExtOpA.getOpcode() == ISD::ADD &&
+       MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB)) ||
+      (ExtOpB.getOpcode() == ISD::ADD &&
+       MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA));
+
+  // If the shift is signed (sra):
+  //  - Needs >= 2 sign bit for both operands.
+  //  - Needs >= 2 zero bits.
+  // If the shift is unsigned (srl):
+  //  - Needs >= 1 zero bit for both operands.
+  //  - Needs 1 demanded bit zero and >= 2 sign bits.
+  unsigned ShiftOpc = Op.getOpcode();
+  bool IsSigned = false;
+  unsigned KnownBits;
+  unsigned NumSignedA = DAG.ComputeNumSignBits(ExtOpA, DemandedElts, Depth);
+  unsigned NumSignedB = DAG.ComputeNumSignBits(ExtOpB, DemandedElts, Depth);
+  unsigned NumSigned = std::min(NumSignedA, NumSignedB) - 1;
+  unsigned NumZeroA =
+      DAG.computeKnownBits(ExtOpA, DemandedElts, Depth).countMinLeadingZeros();
+  unsigned NumZeroB =
+      DAG.computeKnownBits(ExtOpB, DemandedElts, Depth).countMinLeadingZeros();
+  unsigned NumZero = std::min(NumZeroA, NumZeroB);
+
+  switch (ShiftOpc) {
+  default:
+    llvm_unreachable("Unexpected ShiftOpc in combineShiftToAVG");
+  case ISD::SRA: {
+    if (NumZero >= 2 && NumSigned < NumZero) {
+      IsSigned = false;
+      KnownBits = NumZero;
+      break;
+    }
+    if (NumSigned >= 1) {
+      IsSigned = true;
+      KnownBits = NumSigned;
+      break;
+    }
+    return SDValue();
+  }
+  case ISD::SRL: {
+    if (NumZero >= 1 && NumSigned < NumZero) {
+      IsSigned = false;
+      KnownBits = NumZero;
+      break;
+    }
+    if (NumSigned >= 1 && DemandedBits.isSignBitClear()) {
+      IsSigned = true;
+      KnownBits = NumSigned;
+      break;
+    }
+    return SDValue();
+  }
+  }
+
+  unsigned AVGOpc = IsCeil ? (IsSigned ? ISD::AVGCEILS : ISD::AVGCEILU)
+                           : (IsSigned ? ISD::AVGFLOORS : ISD::AVGFLOORU);
+
+  // Find the smallest power-2 type that is legal for this vector size and
+  // operation, given the original type size and the number of known sign/zero
+  // bits.
+  EVT VT = Op.getValueType();
+  unsigned MinWidth =
+      std::max<unsigned>(VT.getScalarSizeInBits() - KnownBits, 8);
+  EVT NVT = EVT::getIntegerVT(*DAG.getContext(), PowerOf2Ceil(MinWidth));
+  if (VT.isVector())
+    NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
+  if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT))
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue ResultAVG =
+      DAG.getNode(AVGOpc, DL, NVT, DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpA),
+                  DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpB));
+  return DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT,
+                     ResultAVG);
+}
+
 /// Look at Op. At this point, we know that only the OriginalDemandedBits of the
 /// result of Op are ever used downstream. If we can use this information to
 /// simplify Op, create a new simplified DAG node and return true, returning the
@@ -1569,6 +1695,11 @@
     SDValue Op1 = Op.getOperand(1);
     EVT ShiftVT = Op1.getValueType();
 
+    // Try to match AVG patterns.
+    if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+                                        DemandedElts, Depth + 1))
+      return TLO.CombineTo(Op, AVG);
+
     if (const APInt *SA =
             TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
       unsigned ShAmt = SA->getZExtValue();
@@ -1635,6 +1766,11 @@
     if (DemandedBits.isOne())
      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
 
+    // Try to match AVG patterns.
+    if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+                                        DemandedElts, Depth + 1))
+      return TLO.CombineTo(Op, AVG);
+
     if (const APInt *SA =
             TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
       unsigned ShAmt = SA->getZExtValue();
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -705,8 +705,8 @@
 define <4 x i32> @hadd16_sext_asr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
 ; CHECK-LABEL: hadd16_sext_asr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    saddl.4s v0, v0, v1
-; CHECK-NEXT:    sshr.4s v0, v0, #1
+; CHECK-NEXT:    shadd.4h v0, v0, v1
+; CHECK-NEXT:    sshll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = sext <4 x i16> %src2 to <4 x i32>
@@ -718,8 +718,8 @@
 define <4 x i32> @hadd16_zext_asr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
 ; CHECK-LABEL: hadd16_zext_asr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uaddl.4s v0, v0, v1
-; CHECK-NEXT:    ushr.4s v0, v0, #1
+; CHECK-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-NEXT:    ushll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
@@ -744,8 +744,8 @@
 define <4 x i32> @hadd16_zext_lsr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
 ; CHECK-LABEL: hadd16_zext_lsr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uaddl.4s v0, v0, v1
-; CHECK-NEXT:    ushr.4s v0, v0, #1
+; CHECK-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-NEXT:    ushll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
@@ -759,10 +759,9 @@
 define <4 x i64> @hadd32_sext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
 ; CHECK-LABEL: hadd32_sext_asr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    saddl2.2d v2, v0, v1
-; CHECK-NEXT:    saddl.2d v0, v0, v1
-; CHECK-NEXT:    sshr.2d v1, v2, #1
-; CHECK-NEXT:    sshr.2d v0, v0, #1
+; CHECK-NEXT:    shadd.4s v0, v0, v1
+; CHECK-NEXT:
sshll2.2d v1, v0, #0 +; CHECK-NEXT: sshll.2d v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i32> %src1 to <4 x i64> %zextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -774,10 +773,9 @@ define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind { ; CHECK-LABEL: hadd32_zext_asr: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddl2.2d v2, v0, v1 -; CHECK-NEXT: uaddl.2d v0, v0, v1 -; CHECK-NEXT: ushr.2d v1, v2, #1 -; CHECK-NEXT: ushr.2d v0, v0, #1 +; CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: ushll2.2d v1, v0, #0 +; CHECK-NEXT: ushll.2d v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> @@ -804,10 +802,9 @@ define <4 x i64> @hadd32_zext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind { ; CHECK-LABEL: hadd32_zext_lsr: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddl2.2d v2, v0, v1 -; CHECK-NEXT: uaddl.2d v0, v0, v1 -; CHECK-NEXT: ushr.2d v1, v2, #1 -; CHECK-NEXT: ushr.2d v0, v0, #1 +; CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: ushll2.2d v1, v0, #0 +; CHECK-NEXT: ushll.2d v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> @@ -884,9 +881,8 @@ define void @testLowerToSHADD8b_c(<8 x i8> %src1, <8 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD8b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: saddw.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v1, #10 +; CHECK-NEXT: shadd.8b v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <8 x i8> %src1 to <8 x i16> @@ -900,9 +896,8 @@ define void @testLowerToSHADD4h_c(<4 x i16> %src1, <4 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD4h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: saddw.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v0, v0, #1 +; CHECK-NEXT: movi.4h v1, #10 +; CHECK-NEXT: shadd.4h v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <4 x i16> %src1 to <4 x i32> @@ -916,10 +911,8 @@ define void @testLowerToSHADD2s_c(<2 x i32> %src1, <2 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD2s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: saddw.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v0, v0, #1 +; CHECK-NEXT: movi.2s v1, #10 +; CHECK-NEXT: shadd.2s v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <2 x i32> %src1 to <2 x i64> @@ -933,12 +926,9 @@ define void @testLowerToSHADD16b_c(<16 x i8> %src1, <16 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD16b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: saddw.8h v2, v1, v0 -; CHECK-NEXT: saddw2.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v1, v2, #1 -; CHECK-NEXT: shrn2.16b v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.16b v1, #10 +; CHECK-NEXT: shadd.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <16 x i8> %src1 to <16 x i16> %add = add <16 x i16> %sextsrc1, @@ -951,12 +941,9 @@ define void @testLowerToSHADD8h_c(<8 x i16> %src1, <8 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD8h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: saddw.4s v2, v1, v0 -; CHECK-NEXT: saddw2.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v1, v2, #1 -; CHECK-NEXT: shrn2.8h v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.8h v1, #10 +; CHECK-NEXT: shadd.8h v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> %sextsrc1, @@ -969,13 
+956,9 @@ define void @testLowerToSHADD4s_c(<4 x i32> %src1, <4 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD4s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: saddw.2d v2, v1, v0 -; CHECK-NEXT: saddw2.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v1, v2, #1 -; CHECK-NEXT: shrn2.4s v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.4s v1, #10 +; CHECK-NEXT: shadd.4s v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %add = add <4 x i64> %sextsrc1, @@ -988,9 +971,8 @@ define void @testLowerToUHADD8b_c(<8 x i8> %src1, <8 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD8b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: uaddw.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v1, #10 +; CHECK-NEXT: uhadd.8b v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i8> %src1 to <8 x i16> @@ -1004,9 +986,8 @@ define void @testLowerToUHADD4h_c(<4 x i16> %src1, <4 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD4h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: uaddw.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v0, v0, #1 +; CHECK-NEXT: movi.4h v1, #10 +; CHECK-NEXT: uhadd.4h v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i16> %src1 to <4 x i32> @@ -1020,10 +1001,8 @@ define void @testLowerToUHADD2s_c(<2 x i32> %src1, <2 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD2s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: uaddw.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v0, v0, #1 +; CHECK-NEXT: movi.2s v1, #10 +; CHECK-NEXT: uhadd.2s v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i32> %src1 to <2 x i64> @@ -1037,12 +1016,9 @@ define void @testLowerToUHADD16b_c(<16 x i8> %src1, <16 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD16b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: uaddw.8h v2, v1, v0 -; CHECK-NEXT: uaddw2.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v1, v2, #1 -; CHECK-NEXT: shrn2.16b v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.16b v1, #10 +; CHECK-NEXT: uhadd.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <16 x i8> %src1 to <16 x i16> %add = add <16 x i16> %zextsrc1, @@ -1055,12 +1031,9 @@ define void @testLowerToUHADD8h_c(<8 x i16> %src1, <8 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD8h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: uaddw.4s v2, v1, v0 -; CHECK-NEXT: uaddw2.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v1, v2, #1 -; CHECK-NEXT: shrn2.8h v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.8h v1, #10 +; CHECK-NEXT: uhadd.8h v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> %zextsrc1, @@ -1073,13 +1046,9 @@ define void @testLowerToUHADD4s_c(<4 x i32> %src1, <4 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD4s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: uaddw.2d v2, v1, v0 -; CHECK-NEXT: uaddw2.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v1, v2, #1 -; CHECK-NEXT: shrn2.4s v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.4s v1, #10 +; CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %add = add <4 x i64> %zextsrc1, @@ -1093,10 +1062,10 @@ define <8 x i8> 
@andmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: andmaskv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v2, #7 -; CHECK-NEXT: and.16b v0, v0, v2 -; CHECK-NEXT: uaddw.8h v0, v0, v1 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v2, #7 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: uhadd.8b v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <8 x i16> %src1, %zextsrc2 = zext <8 x i8> %src2 to <8 x i16> @@ -1109,13 +1078,10 @@ define <16 x i8> @andmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) nounwind { ; CHECK-LABEL: andmaskv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v3, #7 +; CHECK-NEXT: movi.16b v3, #7 +; CHECK-NEXT: uzp1.16b v0, v0, v1 ; CHECK-NEXT: and.16b v0, v0, v3 -; CHECK-NEXT: and.16b v1, v1, v3 -; CHECK-NEXT: uaddw.8h v0, v0, v2 -; CHECK-NEXT: uaddw2.8h v1, v1, v2 -; CHECK-NEXT: shrn.8b v0, v0, #1 -; CHECK-NEXT: shrn2.16b v0, v1, #1 +; CHECK-NEXT: uhadd.16b v0, v0, v2 ; CHECK-NEXT: ret %zextsrc1 = and <16 x i16> %src1, %zextsrc2 = zext <16 x i8> %src2 to <16 x i16> @@ -1128,16 +1094,13 @@ define <16 x i8> @andmask2v16i8(<16 x i16> %src1, <16 x i16> %src2) nounwind { ; CHECK-LABEL: andmask2v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v4, #7 -; CHECK-NEXT: movi.8h v5, #3 -; CHECK-NEXT: and.16b v0, v0, v4 -; CHECK-NEXT: and.16b v2, v2, v5 -; CHECK-NEXT: and.16b v1, v1, v4 -; CHECK-NEXT: and.16b v3, v3, v5 -; CHECK-NEXT: add.8h v0, v0, v2 -; CHECK-NEXT: add.8h v1, v1, v3 -; CHECK-NEXT: shrn.8b v0, v0, #1 -; CHECK-NEXT: shrn2.16b v0, v1, #1 +; CHECK-NEXT: movi.16b v4, #3 +; CHECK-NEXT: movi.16b v5, #7 +; CHECK-NEXT: uzp1.16b v2, v2, v3 +; CHECK-NEXT: uzp1.16b v0, v0, v1 +; CHECK-NEXT: and.16b v1, v2, v4 +; CHECK-NEXT: and.16b v0, v0, v5 +; CHECK-NEXT: uhadd.16b v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <16 x i16> %src1, %zextsrc2 = and <16 x i16> %src2, @@ -1150,11 +1113,11 @@ define <8 x i8> @andmask2v8i8(<8 x i16> %src1, <8 x i16> %src2) nounwind { ; CHECK-LABEL: andmask2v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v2, #7 -; CHECK-NEXT: bic.8h v1, #255, lsl #8 -; CHECK-NEXT: and.16b v0, v0, v2 -; CHECK-NEXT: add.8h v0, v0, v1 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v2, #7 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: xtn.8b v1, v1 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: uhadd.8b v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <8 x i16> %src1, %zextsrc2 = and <8 x i16> %src2, @@ -1170,8 +1133,7 @@ ; CHECK-NEXT: movi.8h v2, #7 ; CHECK-NEXT: bic.8h v1, #254, lsl #8 ; CHECK-NEXT: and.16b v0, v0, v2 -; CHECK-NEXT: add.8h v0, v0, v1 -; CHECK-NEXT: ushr.8h v0, v0, #1 +; CHECK-NEXT: uhadd.8h v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <8 x i16> %src1, %zextsrc2 = and <8 x i16> %src2, @@ -1183,12 +1145,10 @@ define <16 x i8> @sextmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) nounwind { ; CHECK-LABEL: sextmaskv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll.8h v3, v2, #0 ; CHECK-NEXT: sshr.8h v1, v1, #11 -; CHECK-NEXT: ssra.8h v3, v0, #11 -; CHECK-NEXT: saddw2.8h v1, v1, v2 -; CHECK-NEXT: shrn.8b v0, v3, #1 -; CHECK-NEXT: shrn2.16b v0, v1, #1 +; CHECK-NEXT: sshr.8h v0, v0, #11 +; CHECK-NEXT: uzp1.16b v0, v0, v1 +; CHECK-NEXT: shadd.16b v0, v0, v2 ; CHECK-NEXT: ret %sextsrc1 = ashr <16 x i16> %src1, %sextsrc2 = sext <16 x i8> %src2 to <16 x i16> @@ -1201,9 +1161,9 @@ define <8 x i8> @sextmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: sextmaskv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll.8h v1, v1, #0 -; CHECK-NEXT: ssra.8h v1, v0, #11 -; CHECK-NEXT: shrn.8b v0, v1, #1 +; CHECK-NEXT: 
sshr.8h v0, v0, #11 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: shadd.8b v0, v0, v1 ; CHECK-NEXT: ret %sextsrc1 = ashr <8 x i16> %src1, %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> @@ -1216,9 +1176,8 @@ define <8 x i8> @sextmask2v8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: sextmask2v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll.8h v1, v1, #0 -; CHECK-NEXT: ssra.8h v1, v0, #8 -; CHECK-NEXT: shrn.8b v0, v1, #1 +; CHECK-NEXT: shrn.8b v0, v0, #8 +; CHECK-NEXT: shadd.8b v0, v0, v1 ; CHECK-NEXT: ret %sextsrc1 = ashr <8 x i16> %src1, %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> @@ -1231,9 +1190,10 @@ define <8 x i8> @sextmask3v8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: sextmask3v8i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sshr.8h v0, v0, #7 ; CHECK-NEXT: sshll.8h v1, v1, #0 -; CHECK-NEXT: usra.8h v1, v0, #7 -; CHECK-NEXT: shrn.8b v0, v1, #1 +; CHECK-NEXT: shadd.8h v0, v0, v1 +; CHECK-NEXT: xtn.8b v0, v0 ; CHECK-NEXT: ret %sextsrc1 = ashr <8 x i16> %src1, %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll --- a/llvm/test/CodeGen/AArch64/hadd-combine.ll +++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll @@ -17,11 +17,8 @@ define <8 x i16> @haddu_const(<8 x i16> %src1) { ; CHECK-LABEL: haddu_const: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: uaddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> %zextsrc1, @@ -33,11 +30,8 @@ define <8 x i16> @haddu_const_lhs(<8 x i16> %src1) { ; CHECK-LABEL: haddu_const_lhs: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: uaddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> , %zextsrc1 @@ -188,11 +182,8 @@ define <8 x i16> @hadds_const(<8 x i16> %src1) { ; CHECK-LABEL: hadds_const: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: saddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = sext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> %zextsrc1, @@ -204,11 +195,8 @@ define <8 x i16> @hadds_const_lhs(<8 x i16> %src1) { ; CHECK-LABEL: hadds_const_lhs: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: saddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = sext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> , %zextsrc1 @@ -362,11 +350,8 @@ define <8 x i16> @rhaddu_const(<8 x i16> %src1) { ; CHECK-LABEL: rhaddu_const: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #2 -; CHECK-NEXT: uaddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = zext <8 
x i16> %src1 to <8 x i32> %add1 = add <8 x i32> %zextsrc1, @@ -379,11 +364,8 @@ define <8 x i16> @rhaddu_const_lhs(<8 x i16> %src1) { ; CHECK-LABEL: rhaddu_const_lhs: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #2 -; CHECK-NEXT: uaddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %add1 = add <8 x i32> , %zextsrc1 @@ -396,11 +378,8 @@ define <8 x i16> @rhaddu_const_zero(<8 x i16> %src1) { ; CHECK-LABEL: rhaddu_const_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: uaddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %add1 = add <8 x i32> , %zextsrc1 @@ -439,11 +418,8 @@ define <8 x i16> @rhaddu_undef(<8 x i16> %src1) { ; CHECK-LABEL: rhaddu_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: uaddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %zextsrc2 = zext <8 x i16> undef to <8 x i32> @@ -543,11 +519,8 @@ define <8 x i16> @rhadds_const(<8 x i16> %src1) { ; CHECK-LABEL: rhadds_const: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #2 -; CHECK-NEXT: saddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = sext <8 x i16> %src1 to <8 x i32> %add1 = add <8 x i32> %zextsrc1, @@ -560,11 +533,8 @@ define <8 x i16> @rhadds_const_lhs(<8 x i16> %src1) { ; CHECK-LABEL: rhadds_const_lhs: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #2 -; CHECK-NEXT: saddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = sext <8 x i16> %src1 to <8 x i32> %add1 = add <8 x i32> , %zextsrc1 @@ -577,11 +547,8 @@ define <8 x i16> @rhadds_const_zero(<8 x i16> %src1) { ; CHECK-LABEL: rhadds_const_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: saddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = sext <8 x i16> %src1 to <8 x i32> %add1 = add <8 x i32> , %zextsrc1 @@ -620,11 +587,8 @@ define <8 x i16> @rhadds_undef(<8 x i16> %src1) { ; CHECK-LABEL: rhadds_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: saddw v2.4s, v1.4s, v0.4h -; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h -; CHECK-NEXT: shrn v0.4h, v2.4s, #1 -; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %zextsrc1 = sext <8 x i16> %src1 to <8 x i32> %zextsrc2 = sext <8 x i16> undef to <8 x i32> diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- 
a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -69,26 +69,12 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX1-LABEL: avg_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: retq -; -; AVX2-LABEL: avg_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq -; -; AVX512-LABEL: avg_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu %xmm0, (%rax) -; AVX512-NEXT: retq +; AVX-LABEL: avg_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, (%rax) +; AVX-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a %2 = load <16 x i8>, <16 x i8>* %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -429,8 +415,8 @@ ; ; AVX512-LABEL: avg_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -2660,7 +2646,7 @@ ; AVX-LABEL: PR52131_pavg_chain: ; AVX: # %bb.0: ; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpavgw %xmm0, %xmm2, %xmm0 ; AVX-NEXT: retq %i = zext <8 x i16> %a to <8 x i32> %i1 = zext <8 x i16> %b to <8 x i32> @@ -2679,95 +2665,15 @@ define <8 x i16> @PR52131_pavg_chainlike_but_not_zext(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; SSE2-LABEL: PR52131_pavg_chainlike_but_not_zext: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: psubd %xmm1, %xmm5 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: pslld $15, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $15, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: pavgw %xmm1, %xmm0 +; SSE2-NEXT: pavgw %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX1-LABEL: PR52131_pavg_chainlike_but_not_zext: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-NEXT: vpaddd 
%xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsubd %xmm1, %xmm4, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: PR52131_pavg_chainlike_but_not_zext: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: PR52131_pavg_chainlike_but_not_zext: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: PR52131_pavg_chainlike_but_not_zext: +; AVX: # %bb.0: +; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpavgw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: retq %i = zext <8 x i16> %a to <8 x i32> %i1 = zext <8 x i16> %b to <8 x i32> %i2 = add nuw nsw <8 x i32> %i, @@ -2798,7 +2704,7 @@ ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpavgw %xmm1, %xmm0, 
%xmm0 +; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -2806,7 +2712,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2814,7 +2720,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpavgw %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %i = and <8 x i32> %a,
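As a closing note on the narrowing visible in the AArch64 tests above (for example saddl.4s + sshr.4s becoming shadd.4h + sshll.4s): combineShiftToAVG only needs the operand bits that are not already known sign/zero bits, so it forms the AVG node at the smallest legal power-of-two element width and then extends the result. A standalone sketch of that width calculation, using plain integers rather than EVTs (illustrative only, not code from the patch; avgElementWidth is a made-up name and PowerOf2Ceil is open-coded):

#include <algorithm>

// Mirrors MinWidth = max(ScalarSizeInBits - KnownBits, 8) rounded up to a
// power of two, where KnownBits is the number of redundant top bits common
// to both add operands (known sign bits for the signed case, known zero
// bits for the unsigned case).
static unsigned avgElementWidth(unsigned ScalarSizeInBits, unsigned KnownBits) {
  unsigned MinWidth = std::max(ScalarSizeInBits - KnownBits, 8u);
  unsigned Pow2 = 8;
  while (Pow2 < MinWidth) // round up to the next power of two
    Pow2 *= 2;
  return Pow2;
}

// Two i16 values sign-extended to i32 leave at least 16 redundant top bits in
// the add, so avgElementWidth(32, 16) == 16 and the average is formed as a
// v4i16 shadd; avgElementWidth(64, 32) == 32 matches the hadd32 tests.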