Index: llvm/include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- llvm/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -614,6 +614,17 @@ MULHU, MULHS, + /// AVGFLOORS/AVGFLOORU - Halving add - Add two integers using an integer of + /// type i[N+1], halving the result by shifting it one bit right. + /// shr(add(ext(X), ext(Y)), 1) + AVGFLOORS, + AVGFLOORU, + /// AVGCEILS/AVGCEILU - Rounding halving add - Add two integers using an + /// integer of type i[N+2], add 1 and halve the result by shifting it one bit + /// right. shr(add(ext(X), ext(Y), 1), 1) + AVGCEILS, + AVGCEILU, + // ABDS/ABDU - Absolute difference - Return the absolute difference between // two numbers interpreted as signed/unsigned. // i.e trunc(abs(sext(Op0) - sext(Op1))) becomes abds(Op0, Op1) Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -2505,6 +2505,10 @@ case ISD::FMAXNUM_IEEE: case ISD::FMINIMUM: case ISD::FMAXIMUM: + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: return true; default: return false; } Index: llvm/include/llvm/Target/TargetSelectionDAG.td =================================================================== --- llvm/include/llvm/Target/TargetSelectionDAG.td +++ llvm/include/llvm/Target/TargetSelectionDAG.td @@ -365,6 +365,10 @@ [SDNPCommutative, SDNPAssociative]>; def mulhs : SDNode<"ISD::MULHS" , SDTIntBinOp, [SDNPCommutative]>; def mulhu : SDNode<"ISD::MULHU" , SDTIntBinOp, [SDNPCommutative]>; +def avgfloors : SDNode<"ISD::AVGFLOORS" , SDTIntBinOp, [SDNPCommutative]>; +def avgflooru : SDNode<"ISD::AVGFLOORU" , SDTIntBinOp, [SDNPCommutative]>; +def avgceils : SDNode<"ISD::AVGCEILS" , SDTIntBinOp, [SDNPCommutative]>; +def avgceilu : SDNode<"ISD::AVGCEILU" , SDTIntBinOp, [SDNPCommutative]>; def abds : SDNode<"ISD::ABDS" , SDTIntBinOp, [SDNPCommutative]>; def abdu : SDNode<"ISD::ABDU" , SDTIntBinOp, [SDNPCommutative]>; def smullohi : SDNode<"ISD::SMUL_LOHI" , SDTIntBinHiLoOp, [SDNPCommutative]>; Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18380,6 +18380,15 @@ APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), ST->getMemoryVT().getScalarSizeInBits()); + // Convert a truncating store of a extension into a standard store. + if ((Value.getOpcode() == ISD::ZERO_EXTEND || + Value.getOpcode() == ISD::SIGN_EXTEND || + Value.getOpcode() == ISD::ANY_EXTEND) && + Value.getOperand(0).getValueType() == ST->getMemoryVT() && + TLI.isOperationLegalOrCustom(ISD::STORE, ST->getMemoryVT())) + return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, + ST->getMemOperand()); + // See if we can simplify the input to this truncstore with knowledge that // only the low bits are being used. 
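As an illustration (not part of the diff), a minimal scalar sketch of the semantics the ISDOpcodes.h comments above give for the four new nodes, assuming i8 elements; the widened intermediate type is what keeps the add from wrapping, and the signed variants rely on an arithmetic right shift for negative values (guaranteed in C++20 and later):

    #include <cstdint>

    // Widen, add (plus 1 for the CEIL forms), then shift right by one.
    static int8_t  avgfloors(int8_t a, int8_t b)   { return (int16_t(a) + int16_t(b)) >> 1; }       // shr(add(sext, sext), 1)
    static uint8_t avgflooru(uint8_t a, uint8_t b) { return (uint16_t(a) + uint16_t(b)) >> 1; }     // shr(add(zext, zext), 1)
    static int8_t  avgceils(int8_t a, int8_t b)    { return (int16_t(a) + int16_t(b) + 1) >> 1; }   // shr(add(sext, sext, 1), 1)
    static uint8_t avgceilu(uint8_t a, uint8_t b)  { return (uint16_t(a) + uint16_t(b) + 1) >> 1; } // shr(add(zext, zext, 1), 1)

    int main() {
      // Floor and ceil differ only when the true average has a .5 fraction.
      return (avgflooru(7, 8) == 7 && avgceilu(7, 8) == 8 &&
              avgfloors(-3, -4) == -4 && avgceils(-3, -4) == -3) ? 0 : 1;
    }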
For example: // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8" Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3088,6 +3088,10 @@ case ISD::USHLSAT: case ISD::ROTL: case ISD::ROTR: + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: Res = WidenVecRes_Binary(N, /*IsVP*/ false); break; Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -231,6 +231,10 @@ case ISD::MUL: return "mul"; case ISD::MULHU: return "mulhu"; case ISD::MULHS: return "mulhs"; + case ISD::AVGFLOORU: return "avgflooru"; + case ISD::AVGFLOORS: return "avgfloors"; + case ISD::AVGCEILU: return "avgceilu"; + case ISD::AVGCEILS: return "avgceils"; case ISD::ABDS: return "abds"; case ISD::ABDU: return "abdu"; case ISD::SDIV: return "sdiv"; Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -888,6 +888,113 @@ Depth); } +// Attempt to form ext(avgfloor(A, B)) from shr(add(ext(A), ext(B)), 1). +// or to form ext(avgceil(A, B)) from shr(add(ext(A), ext(B), 1), 1). +static SDValue combineShiftToAVG(SDValue Op, SelectionDAG &DAG, + const TargetLowering &TLI, + const APInt &DemandedBits) { + assert((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) && + "SRL or SRA node is required here!"); + // Is the right shift using an immediate value of 1? + ConstantSDNode *N1C = isConstOrConstSplat(Op.getOperand(1)); + if (!N1C || !N1C->isOne()) + return SDValue(); + + // We are looking for an avgfloor + // add(ext, ext) + // or one of these as a avgceil + // add(add(ext, ext), 1) + // add(add(ext, 1), ext) + // add(ext, add(ext, 1)) + SDValue Add = Op.getOperand(0); + if (Add.getOpcode() != ISD::ADD) + return SDValue(); + + SDValue ExtOpA = Add.getOperand(0); + SDValue ExtOpB = Add.getOperand(1); + auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3) { + ConstantSDNode *ConstOp; + if ((ConstOp = isConstOrConstSplat(Op1)) && ConstOp->isOne()) { + ExtOpA = Op2; + ExtOpB = Op3; + return true; + } + if ((ConstOp = isConstOrConstSplat(Op2)) && ConstOp->isOne()) { + ExtOpA = Op1; + ExtOpB = Op3; + return true; + } + if ((ConstOp = isConstOrConstSplat(Op3)) && ConstOp->isOne()) { + ExtOpA = Op1; + ExtOpB = Op2; + return true; + } + return false; + }; + bool IsCeil = + (ExtOpA.getOpcode() == ISD::ADD && + MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB)) || + (ExtOpB.getOpcode() == ISD::ADD && + MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA)); + + // If the shift is signed (sra): + // - Needs >= 2 sign bit for both operands. + // - Needs >= 2 zero bits. + // If the shift is unsigned (srl): + // - Needs >= 1 zero bit for both operands. + // - Needs 1 demanded bit zero and >= 2 sign bits. 
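As an illustration (not part of the diff), a sketch of why the known-bits requirements above make the rewrite safe, assuming i16 values zero-extended to i32: each extended operand has at least 16 known leading zero bits, so the 32-bit sum fits in 17 bits, the shifted result fits in 16 bits, and srl(add(zext(a), zext(b)), 1) equals a zero-extended 16-bit AVGFLOORU. The narrowed width chosen later in this combine is roughly max(scalar width - KnownBits, 8) rounded up to a power of two, i.e. 16 here, which is what turns the saddl/uaddl plus shift sequences into the shadd.4h/uhadd.4h forms in the arm64-vhadd.ll changes further down.

    #include <cassert>
    #include <cstdint>

    // Shape of the DAG before the combine: zero-extend to i32, add, srl by 1.
    static uint32_t wideAvgFloor(uint16_t a, uint16_t b) {
      return (uint32_t(a) + uint32_t(b)) >> 1;
    }

    // Shape after the combine: a 16-bit AVGFLOORU (modelled here with an add
    // that cannot wrap), then zero-extend the narrow result back to i32.
    static uint32_t narrowAvgFloor(uint16_t a, uint16_t b) {
      uint16_t avg = uint16_t((uint32_t(a) + uint32_t(b)) >> 1);
      return avg;
    }

    int main() {
      // Spot-check the equivalence, including the extreme inputs.
      for (uint32_t a = 0; a <= 0xFFFF; a += 0x101)
        for (uint32_t b = 0; b <= 0xFFFF; b += 0x107)
          assert(wideAvgFloor(uint16_t(a), uint16_t(b)) ==
                 narrowAvgFloor(uint16_t(a), uint16_t(b)));
      assert(wideAvgFloor(0xFFFF, 0xFFFF) == narrowAvgFloor(0xFFFF, 0xFFFF));
      return 0;
    }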
+ unsigned NumSignedA = DAG.ComputeNumSignBits(ExtOpA); + unsigned NumSignedB = DAG.ComputeNumSignBits(ExtOpB); + unsigned NumZeroA = DAG.computeKnownBits(ExtOpA).countMinLeadingZeros(); + unsigned NumZeroB = DAG.computeKnownBits(ExtOpB).countMinLeadingZeros(); + unsigned ShiftOpc = Op.getOpcode(); + bool IsSigned = false; + unsigned KnownBits; + if (ShiftOpc == ISD::SRA) { + if (NumSignedA >= 2 && NumSignedB >= 2) { + IsSigned = true; + KnownBits = std::min(NumSignedA, NumSignedB) - 1; + } else if (NumZeroA >= 2 && NumZeroB >= 2) { + IsSigned = false; + KnownBits = std::min(NumZeroA, NumZeroB); + } else + return SDValue(); + } else if (ShiftOpc == ISD::SRL) { + if (NumZeroA >= 1 && NumZeroB >= 1) { + IsSigned = false; + KnownBits = std::min(NumZeroA, NumZeroB); + } else if (NumSignedA >= 2 && NumSignedB >= 2 && + DemandedBits.isSignBitClear()) { + IsSigned = true; + KnownBits = std::min(NumSignedA, NumSignedB) - 1; + } else + return SDValue(); + } else + return SDValue(); + + unsigned AVGOpc = IsCeil ? (IsSigned ? ISD::AVGCEILS : ISD::AVGCEILU) + : (IsSigned ? ISD::AVGFLOORS : ISD::AVGFLOORU); + + // Find the smallest power-2 type that is legal for this vector size and + // operation, given the original type size and the number of known sign/zero + // bits. + EVT VT = Op.getValueType(); + unsigned MinWidth = + std::max(VT.getScalarSizeInBits() - KnownBits, 8); + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), PowerOf2Ceil(MinWidth)); + if (VT.isVector()) + NVT = VT.changeVectorElementType(NVT); + if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT)) + return SDValue(); + + SDLoc DL(Op); + SDValue ResultAVG = + DAG.getNode(AVGOpc, DL, NVT, DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpA), + DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpB)); + return DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, + ResultAVG); +} + /// Look at Op. At this point, we know that only the OriginalDemandedBits of the /// result of Op are ever used downstream. If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning the @@ -1550,6 +1657,10 @@ SDValue Op1 = Op.getOperand(1); EVT ShiftVT = Op1.getValueType(); + // Try to match AVG patterns. + if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits)) + return TLO.CombineTo(Op, AVG); + if (const APInt *SA = TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) { unsigned ShAmt = SA->getZExtValue(); @@ -1616,6 +1727,10 @@ if (DemandedBits.isOne()) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1)); + // Try to match AVG patterns. 
+ if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits)) + return TLO.CombineTo(Op, AVG); + if (const APInt *SA = TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) { unsigned ShAmt = SA->getZExtValue(); Index: llvm/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -816,6 +816,12 @@ setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::SUBE, VT, Expand); + // Halving adds + setOperationAction(ISD::AVGFLOORS, VT, Expand); + setOperationAction(ISD::AVGFLOORU, VT, Expand); + setOperationAction(ISD::AVGCEILS, VT, Expand); + setOperationAction(ISD::AVGCEILU, VT, Expand); + // Absolute difference setOperationAction(ISD::ABDS, VT, Expand); setOperationAction(ISD::ABDU, VT, Expand); Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -230,14 +230,6 @@ SADDV, UADDV, - // Vector halving addition - SHADD, - UHADD, - - // Vector rounding halving addition - SRHADD, - URHADD, - // Unsigned Add Long Pairwise UADDLP, Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -913,7 +913,6 @@ setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::VECTOR_SPLICE); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::STORE); @@ -1085,6 +1084,10 @@ for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32}) { + setOperationAction(ISD::AVGFLOORS, VT, Legal); + setOperationAction(ISD::AVGFLOORU, VT, Legal); + setOperationAction(ISD::AVGCEILS, VT, Legal); + setOperationAction(ISD::AVGCEILU, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); } @@ -2073,10 +2076,6 @@ MAKE_CASE(AArch64ISD::FCMLTz) MAKE_CASE(AArch64ISD::SADDV) MAKE_CASE(AArch64ISD::UADDV) - MAKE_CASE(AArch64ISD::SRHADD) - MAKE_CASE(AArch64ISD::URHADD) - MAKE_CASE(AArch64ISD::SHADD) - MAKE_CASE(AArch64ISD::UHADD) MAKE_CASE(AArch64ISD::SDOT) MAKE_CASE(AArch64ISD::UDOT) MAKE_CASE(AArch64ISD::SMINV) @@ -4306,9 +4305,9 @@ IntNo == Intrinsic::aarch64_neon_shadd); bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || IntNo == Intrinsic::aarch64_neon_urhadd); - unsigned Opcode = - IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD) - : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD); + unsigned Opcode = IsSignedAdd + ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS) + : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU); return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } @@ -13989,89 +13988,6 @@ return SDValue(); } -// Attempt to form urhadd(OpA, OpB) from -// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)) -// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)). -// The original form of the first expression is -// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the -// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)). 
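As a side note on the removed comment above (not part of the diff): the (OpA + OpB + 1) to (OpB - (~OpA)) rewrite it describes is just the two's-complement identity ~a == -a - 1, so b - (~a) == a + b + 1. A minimal compile-time check of that identity at the i16 width the old matcher saw after extension:

    #include <cstdint>

    // b - (~a) == a + b + 1 in two's-complement arithmetic, since ~a == -a - 1.
    static_assert(int16_t(200) - int16_t(~int16_t(100)) == 100 + 200 + 1,
                  "b - (~a) must equal a + b + 1");
    static_assert(int16_t(0) - int16_t(~int16_t(255)) == 255 + 0 + 1,
                  "b - (~a) must equal a + b + 1");

    int main() { return 0; }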
-// Before this function is called the srl will have been lowered to -// AArch64ISD::VLSHR. -// This pass can also recognize signed variants of the patterns that use sign -// extension instead of zero extension and form a srhadd(OpA, OpB) or a -// shadd(OpA, OpB) from them. -static SDValue -performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - // Since we are looking for a right shift by a constant value of 1 and we are - // operating on types at least 16 bits in length (sign/zero extended OpA and - // OpB, which are at least 8 bits), it follows that the truncate will always - // discard the shifted-in bit and therefore the right shift will be logical - // regardless of the signedness of OpA and OpB. - SDValue Shift = N->getOperand(0); - if (Shift.getOpcode() != AArch64ISD::VLSHR) - return SDValue(); - - // Is the right shift using an immediate value of 1? - uint64_t ShiftAmount = Shift.getConstantOperandVal(1); - if (ShiftAmount != 1) - return SDValue(); - - SDValue ExtendOpA, ExtendOpB; - SDValue ShiftOp0 = Shift.getOperand(0); - unsigned ShiftOp0Opc = ShiftOp0.getOpcode(); - if (ShiftOp0Opc == ISD::SUB) { - - SDValue Xor = ShiftOp0.getOperand(1); - if (Xor.getOpcode() != ISD::XOR) - return SDValue(); - - // Is the XOR using a constant amount of all ones in the right hand side? - uint64_t C; - if (!isAllConstantBuildVector(Xor.getOperand(1), C)) - return SDValue(); - - unsigned ElemSizeInBits = VT.getScalarSizeInBits(); - APInt CAsAPInt(ElemSizeInBits, C); - if (CAsAPInt != APInt::getAllOnes(ElemSizeInBits)) - return SDValue(); - - ExtendOpA = Xor.getOperand(0); - ExtendOpB = ShiftOp0.getOperand(0); - } else if (ShiftOp0Opc == ISD::ADD) { - ExtendOpA = ShiftOp0.getOperand(0); - ExtendOpB = ShiftOp0.getOperand(1); - } else - return SDValue(); - - unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); - unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); - if (!(ExtendOpAOpc == ExtendOpBOpc && - (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) - return SDValue(); - - // Is the result of the right shift being truncated to the same value type as - // the original operands, OpA and OpB? - SDValue OpA = ExtendOpA.getOperand(0); - SDValue OpB = ExtendOpB.getOperand(0); - EVT OpAVT = OpA.getValueType(); - assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); - if (!(VT == OpAVT && OpAVT == OpB.getValueType())) - return SDValue(); - - SDLoc DL(N); - bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; - bool IsRHADD = ShiftOp0Opc == ISD::SUB; - unsigned HADDOpc = IsSignExtend - ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD) - : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD); - SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB); - - return ResultHADD; -} - static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { switch (Opcode) { case ISD::FADD: @@ -14174,9 +14090,10 @@ if (DCI.isBeforeLegalizeOps()) return SDValue(); - // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted - // subvectors from the same original vectors. Combine these into a single - // [us]rhadd or [us]hadd that operates on the two original vectors. Example: + // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use + // extracted subvectors from the same original vectors. Combine these into a + // single [us]rhadd or [us]hadd that operates on the two original vectors. 
+ // Example: // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>), // extract_subvector (v16i8 OpB, // <0>))), @@ -14186,8 +14103,8 @@ // -> // (v16i8(urhadd(v16i8 OpA, v16i8 OpB))) if (N->getNumOperands() == 2 && N0Opc == N1Opc && - (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD || - N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) { + (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS || + N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); SDValue N10 = N1->getOperand(0); @@ -17478,8 +17395,6 @@ return performExtendCombine(N, DCI, DAG); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI, DAG); - case ISD::TRUNCATE: - return performVectorTruncateCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::INSERT_SUBVECTOR: Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -624,11 +624,6 @@ def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>; def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; -def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>; -def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>; -def AArch64shadd : SDNode<"AArch64ISD::SHADD", SDT_AArch64binvec>; -def AArch64uhadd : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>; - def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs), [(abdu node:$lhs, node:$rhs), (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>; @@ -4441,7 +4436,7 @@ defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >; defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>; -defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", AArch64shadd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", avgfloors>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; @@ -4453,14 +4448,14 @@ defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; -defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", avgceils>; defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >; defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>; -defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", AArch64uhadd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", avgflooru>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; @@ -4470,7 +4465,7 @@ defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : 
SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>; -defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", avgceilu>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -249,9 +249,6 @@ SCALEFS, SCALEFS_RND, - // Unsigned Integer average. - AVG, - /// Integer horizontal add/sub. HADD, HSUB, Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -948,6 +948,8 @@ setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); setOperationAction(ISD::SMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v16i8, Custom); @@ -1345,6 +1347,10 @@ setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); + if (HasInt256) { + setOperationAction(ISD::AVGCEILU, MVT::v16i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v32i8, Legal); + } setOperationAction(ISD::SMULO, MVT::v32i8, Custom); setOperationAction(ISD::UMULO, MVT::v32i8, Custom); @@ -1644,6 +1650,10 @@ setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); + if (HasBWI) { + setOperationAction(ISD::AVGCEILU, MVT::v32i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v64i8, Legal); + } setOperationAction(ISD::SMULO, MVT::v64i8, Custom); setOperationAction(ISD::UMULO, MVT::v64i8, Custom); @@ -31598,9 +31608,8 @@ Results.push_back(Res); return; } - case X86ISD::VPMADDWD: - case X86ISD::AVG: { - // Legalize types for X86ISD::AVG/VPMADDWD by widening. + case X86ISD::VPMADDWD: { + // Legalize types for X86ISD::VPMADDWD by widening. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); @@ -32832,7 +32841,6 @@ NODE_NAME_CASE(SCALEF_RND) NODE_NAME_CASE(SCALEFS) NODE_NAME_CASE(SCALEFS_RND) - NODE_NAME_CASE(AVG) NODE_NAME_CASE(MULHRS) NODE_NAME_CASE(SINT_TO_FP_RND) NODE_NAME_CASE(UINT_TO_FP_RND) @@ -33013,7 +33021,6 @@ bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { switch (Opcode) { // TODO: Add more X86ISD opcodes once we have test coverage. - case X86ISD::AVG: case X86ISD::PCMPEQ: case X86ISD::PMULDQ: case X86ISD::PMULUDQ: @@ -40394,7 +40401,6 @@ case X86ISD::UNPCKH: case X86ISD::BLENDI: // Integer ops. - case X86ISD::AVG: case X86ISD::PACKSS: case X86ISD::PACKUS: // Horizontal Ops. @@ -47129,7 +47135,7 @@ /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient -/// X86ISD::AVG instruction. +/// ISD::AVGCEILU (AVG) instruction. 
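As an illustration (not part of the diff), the kind of source loop detectAVGPattern is aimed at, written as a hypothetical C++ function: a rounding average of unsigned bytes computed as (a + b + 1) / 2 in a wider type. With this change the matched pattern is emitted as the generic ISD::AVGCEILU node rather than the removed X86ISD::AVG, and still selects to PAVGB (PAVGW for the 16-bit variant):

    #include <cstddef>
    #include <cstdint>

    // Rounding byte average; vectorized for SSE2 this is the shape that
    // typically becomes a pavgb loop.
    void roundingAvgU8(const uint8_t *a, const uint8_t *b, uint8_t *out, size_t n) {
      for (size_t i = 0; i != n; ++i)
        out[i] = uint8_t((uint16_t(a[i]) + uint16_t(b[i]) + 1) >> 1);
    }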
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { @@ -47192,7 +47198,7 @@ auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops); + return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops); }; auto AVGSplitter = [&](std::array Ops) { @@ -47790,7 +47796,8 @@ St->getValue().getOperand(0).getValueType() == MVT::v16i16 && TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) && St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) { - SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); + SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, + St->getValue().getOperand(0)); return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), MVT::v16i8, St->getMemOperand()); } Index: llvm/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/lib/Target/X86/X86InstrAVX512.td +++ llvm/lib/Target/X86/X86InstrAVX512.td @@ -5039,7 +5039,7 @@ HasBWI, 1>; defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SchedWriteVecIMul, HasBWI, 1>, T8PD; -defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, +defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", avgceilu, SchedWriteVecALU, HasBWI, 1>; defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq, SchedWriteVecIMul, HasAVX512, 1>, T8PD; Index: llvm/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -287,7 +287,6 @@ SDTCisSameAs<2, 1>]>; def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>; -def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; Index: llvm/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/lib/Target/X86/X86InstrSSE.td +++ llvm/lib/Target/X86/X86InstrSSE.td @@ -3471,9 +3471,9 @@ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, +defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, +defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, SchedWriteVecIMul, 1, NoVLX>; Index: llvm/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -371,8 +371,8 @@ X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0), 
X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), @@ -818,8 +818,8 @@ X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0), @@ -1281,8 +1281,8 @@ X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), Index: llvm/test/CodeGen/AArch64/arm64-vhadd.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -705,8 +705,8 @@ define <4 x i32> @hadd16_sext_asr(<4 x i16> %src1, <4 x i16> %src2) nounwind { ; CHECK-LABEL: hadd16_sext_asr: ; CHECK: // %bb.0: -; CHECK-NEXT: saddl.4s v0, v0, v1 -; CHECK-NEXT: sshr.4s v0, v0, #1 +; CHECK-NEXT: shadd.4h v0, v0, v1 +; CHECK-NEXT: sshll.4s v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i16> %src1 to <4 x i32> %zextsrc2 = sext <4 x i16> %src2 to <4 x i32> @@ -718,8 +718,9 @@ define <4 x i32> @hadd16_zext_asr(<4 x i16> %src1, <4 x i16> %src2) nounwind { ; CHECK-LABEL: hadd16_zext_asr: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddl.4s v0, v0, v1 -; CHECK-NEXT: ushr.4s v0, v0, #1 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: shadd.4s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i16> %src1 to <4 x i32> %zextsrc2 = zext <4 x i16> %src2 to <4 x i32> @@ -744,8 +745,8 @@ define <4 x i32> @hadd16_zext_lsr(<4 x i16> %src1, <4 x i16> %src2) nounwind { ; CHECK-LABEL: hadd16_zext_lsr: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddl.4s v0, v0, v1 -; CHECK-NEXT: ushr.4s v0, v0, #1 +; CHECK-NEXT: uhadd.4h v0, v0, v1 +; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i16> %src1 to <4 x i32> %zextsrc2 = zext <4 x i16> %src2 to <4 x i32> @@ -759,10 +760,9 @@ define <4 x i64> @hadd32_sext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind { ; CHECK-LABEL: hadd32_sext_asr: ; CHECK: // %bb.0: -; CHECK-NEXT: saddl2.2d v2, v0, v1 -; CHECK-NEXT: saddl.2d v0, v0, v1 -; CHECK-NEXT: sshr.2d v1, v2, #1 -; CHECK-NEXT: sshr.2d v0, v0, #1 +; CHECK-NEXT: shadd.4s v0, v0, v1 +; CHECK-NEXT: sshll2.2d v1, v0, #0 +; CHECK-NEXT: sshll.2d v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i32> %src1 to <4 x 
i64> %zextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -774,10 +774,9 @@ define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind { ; CHECK-LABEL: hadd32_zext_asr: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddl2.2d v2, v0, v1 -; CHECK-NEXT: uaddl.2d v0, v0, v1 -; CHECK-NEXT: ushr.2d v1, v2, #1 -; CHECK-NEXT: ushr.2d v0, v0, #1 +; CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: ushll2.2d v1, v0, #0 +; CHECK-NEXT: ushll.2d v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> @@ -804,10 +803,9 @@ define <4 x i64> @hadd32_zext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind { ; CHECK-LABEL: hadd32_zext_lsr: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddl2.2d v2, v0, v1 -; CHECK-NEXT: uaddl.2d v0, v0, v1 -; CHECK-NEXT: ushr.2d v1, v2, #1 -; CHECK-NEXT: ushr.2d v0, v0, #1 +; CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: ushll2.2d v1, v0, #0 +; CHECK-NEXT: ushll.2d v0, v0, #0 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> @@ -838,8 +836,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: bic.4h v1, #255, lsl #8 -; CHECK-NEXT: add.4h v0, v0, v1 -; CHECK-NEXT: ushr.4h v0, v0, #1 +; CHECK-NEXT: shadd.4h v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i8> %src1 to <4 x i16> %zextsrc2 = zext <4 x i8> %src2 to <4 x i16> @@ -884,9 +881,8 @@ define void @testLowerToSHADD8b_c(<8 x i8> %src1, <8 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD8b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: saddw.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v1, #10 +; CHECK-NEXT: shadd.8b v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <8 x i8> %src1 to <8 x i16> @@ -900,9 +896,8 @@ define void @testLowerToSHADD4h_c(<4 x i16> %src1, <4 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD4h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: saddw.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v0, v0, #1 +; CHECK-NEXT: movi.4h v1, #10 +; CHECK-NEXT: shadd.4h v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <4 x i16> %src1 to <4 x i32> @@ -916,10 +911,8 @@ define void @testLowerToSHADD2s_c(<2 x i32> %src1, <2 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD2s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: saddw.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v0, v0, #1 +; CHECK-NEXT: movi.2s v1, #10 +; CHECK-NEXT: shadd.2s v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <2 x i32> %src1 to <2 x i64> @@ -933,12 +926,9 @@ define void @testLowerToSHADD16b_c(<16 x i8> %src1, <16 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD16b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: saddw.8h v2, v1, v0 -; CHECK-NEXT: saddw2.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v1, v2, #1 -; CHECK-NEXT: shrn2.16b v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.16b v1, #10 +; CHECK-NEXT: shadd.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <16 x i8> %src1 to <16 x i16> %add = add <16 x i16> %sextsrc1, @@ -951,12 +941,9 @@ define void @testLowerToSHADD8h_c(<8 x i16> %src1, <8 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD8h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: saddw.4s v2, v1, v0 -; CHECK-NEXT: saddw2.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v1, v2, #1 -; CHECK-NEXT: shrn2.8h v1, v0, #1 -; CHECK-NEXT: str q1, 
[x0] +; CHECK-NEXT: movi.8h v1, #10 +; CHECK-NEXT: shadd.8h v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> %sextsrc1, @@ -969,13 +956,9 @@ define void @testLowerToSHADD4s_c(<4 x i32> %src1, <4 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToSHADD4s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: saddw.2d v2, v1, v0 -; CHECK-NEXT: saddw2.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v1, v2, #1 -; CHECK-NEXT: shrn2.4s v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.4s v1, #10 +; CHECK-NEXT: shadd.4s v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %add = add <4 x i64> %sextsrc1, @@ -988,9 +971,8 @@ define void @testLowerToUHADD8b_c(<8 x i8> %src1, <8 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD8b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: uaddw.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v1, #10 +; CHECK-NEXT: uhadd.8b v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i8> %src1 to <8 x i16> @@ -1004,9 +986,8 @@ define void @testLowerToUHADD4h_c(<4 x i16> %src1, <4 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD4h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: uaddw.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v0, v0, #1 +; CHECK-NEXT: movi.4h v1, #10 +; CHECK-NEXT: uhadd.4h v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i16> %src1 to <4 x i32> @@ -1020,10 +1001,8 @@ define void @testLowerToUHADD2s_c(<2 x i32> %src1, <2 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD2s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: uaddw.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v0, v0, #1 +; CHECK-NEXT: movi.2s v1, #10 +; CHECK-NEXT: uhadd.2s v0, v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i32> %src1 to <2 x i64> @@ -1037,12 +1016,9 @@ define void @testLowerToUHADD16b_c(<16 x i8> %src1, <16 x i8>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD16b_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v1, #10 -; CHECK-NEXT: uaddw.8h v2, v1, v0 -; CHECK-NEXT: uaddw2.8h v0, v1, v0 -; CHECK-NEXT: shrn.8b v1, v2, #1 -; CHECK-NEXT: shrn2.16b v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.16b v1, #10 +; CHECK-NEXT: uhadd.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <16 x i8> %src1 to <16 x i16> %add = add <16 x i16> %zextsrc1, @@ -1055,12 +1031,9 @@ define void @testLowerToUHADD8h_c(<8 x i16> %src1, <8 x i16>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD8h_c: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.4s v1, #10 -; CHECK-NEXT: uaddw.4s v2, v1, v0 -; CHECK-NEXT: uaddw2.4s v0, v1, v0 -; CHECK-NEXT: shrn.4h v1, v2, #1 -; CHECK-NEXT: shrn2.8h v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.8h v1, #10 +; CHECK-NEXT: uhadd.8h v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %add = add <8 x i32> %zextsrc1, @@ -1073,13 +1046,9 @@ define void @testLowerToUHADD4s_c(<4 x i32> %src1, <4 x i32>* %dest) nounwind { ; CHECK-LABEL: testLowerToUHADD4s_c: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: dup.2d v1, x8 -; CHECK-NEXT: uaddw.2d v2, v1, v0 -; CHECK-NEXT: uaddw2.2d v0, v1, v0 -; CHECK-NEXT: shrn.2s v1, v2, #1 -; CHECK-NEXT: shrn2.4s v1, v0, #1 -; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: movi.4s v1, #10 +; 
CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %add = add <4 x i64> %zextsrc1, @@ -1093,10 +1062,10 @@ define <8 x i8> @andmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: andmaskv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v2, #7 -; CHECK-NEXT: and.16b v0, v0, v2 -; CHECK-NEXT: uaddw.8h v0, v0, v1 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v2, #7 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: uhadd.8b v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <8 x i16> %src1, %zextsrc2 = zext <8 x i8> %src2 to <8 x i16> @@ -1109,13 +1078,10 @@ define <16 x i8> @andmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) nounwind { ; CHECK-LABEL: andmaskv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v3, #7 +; CHECK-NEXT: movi.16b v3, #7 +; CHECK-NEXT: uzp1.16b v0, v0, v1 ; CHECK-NEXT: and.16b v0, v0, v3 -; CHECK-NEXT: and.16b v1, v1, v3 -; CHECK-NEXT: uaddw.8h v0, v0, v2 -; CHECK-NEXT: uaddw2.8h v1, v1, v2 -; CHECK-NEXT: shrn.8b v0, v0, #1 -; CHECK-NEXT: shrn2.16b v0, v1, #1 +; CHECK-NEXT: uhadd.16b v0, v0, v2 ; CHECK-NEXT: ret %zextsrc1 = and <16 x i16> %src1, %zextsrc2 = zext <16 x i8> %src2 to <16 x i16> @@ -1128,16 +1094,13 @@ define <16 x i8> @andmask2v16i8(<16 x i16> %src1, <16 x i16> %src2) nounwind { ; CHECK-LABEL: andmask2v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v4, #7 -; CHECK-NEXT: movi.8h v5, #3 -; CHECK-NEXT: and.16b v0, v0, v4 -; CHECK-NEXT: and.16b v2, v2, v5 -; CHECK-NEXT: and.16b v1, v1, v4 -; CHECK-NEXT: and.16b v3, v3, v5 -; CHECK-NEXT: add.8h v0, v0, v2 -; CHECK-NEXT: add.8h v1, v1, v3 -; CHECK-NEXT: shrn.8b v0, v0, #1 -; CHECK-NEXT: shrn2.16b v0, v1, #1 +; CHECK-NEXT: movi.16b v4, #3 +; CHECK-NEXT: movi.16b v5, #7 +; CHECK-NEXT: uzp1.16b v2, v2, v3 +; CHECK-NEXT: uzp1.16b v0, v0, v1 +; CHECK-NEXT: and.16b v1, v2, v4 +; CHECK-NEXT: and.16b v0, v0, v5 +; CHECK-NEXT: uhadd.16b v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <16 x i16> %src1, %zextsrc2 = and <16 x i16> %src2, @@ -1150,11 +1113,11 @@ define <8 x i8> @andmask2v8i8(<8 x i16> %src1, <8 x i16> %src2) nounwind { ; CHECK-LABEL: andmask2v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.8h v2, #7 -; CHECK-NEXT: bic.8h v1, #255, lsl #8 -; CHECK-NEXT: and.16b v0, v0, v2 -; CHECK-NEXT: add.8h v0, v0, v1 -; CHECK-NEXT: shrn.8b v0, v0, #1 +; CHECK-NEXT: movi.8b v2, #7 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: xtn.8b v1, v1 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: uhadd.8b v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <8 x i16> %src1, %zextsrc2 = and <8 x i16> %src2, @@ -1170,8 +1133,7 @@ ; CHECK-NEXT: movi.8h v2, #7 ; CHECK-NEXT: bic.8h v1, #254, lsl #8 ; CHECK-NEXT: and.16b v0, v0, v2 -; CHECK-NEXT: add.8h v0, v0, v1 -; CHECK-NEXT: ushr.8h v0, v0, #1 +; CHECK-NEXT: uhadd.8h v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = and <8 x i16> %src1, %zextsrc2 = and <8 x i16> %src2, @@ -1183,12 +1145,10 @@ define <16 x i8> @sextmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) nounwind { ; CHECK-LABEL: sextmaskv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll.8h v3, v2, #0 ; CHECK-NEXT: sshr.8h v1, v1, #11 -; CHECK-NEXT: ssra.8h v3, v0, #11 -; CHECK-NEXT: saddw2.8h v1, v1, v2 -; CHECK-NEXT: shrn.8b v0, v3, #1 -; CHECK-NEXT: shrn2.16b v0, v1, #1 +; CHECK-NEXT: sshr.8h v0, v0, #11 +; CHECK-NEXT: uzp1.16b v0, v0, v1 +; CHECK-NEXT: shadd.16b v0, v0, v2 ; CHECK-NEXT: ret %sextsrc1 = ashr <16 x i16> %src1, %sextsrc2 = sext <16 x i8> %src2 to <16 x i16> @@ -1201,9 +1161,9 @@ define <8 x i8> @sextmaskv8i8(<8 x i16> %src1, <8 
x i8> %src2) nounwind { ; CHECK-LABEL: sextmaskv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll.8h v1, v1, #0 -; CHECK-NEXT: ssra.8h v1, v0, #11 -; CHECK-NEXT: shrn.8b v0, v1, #1 +; CHECK-NEXT: sshr.8h v0, v0, #11 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: shadd.8b v0, v0, v1 ; CHECK-NEXT: ret %sextsrc1 = ashr <8 x i16> %src1, %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> @@ -1216,9 +1176,8 @@ define <8 x i8> @sextmask2v8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: sextmask2v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll.8h v1, v1, #0 -; CHECK-NEXT: ssra.8h v1, v0, #8 -; CHECK-NEXT: shrn.8b v0, v1, #1 +; CHECK-NEXT: shrn.8b v0, v0, #8 +; CHECK-NEXT: shadd.8b v0, v0, v1 ; CHECK-NEXT: ret %sextsrc1 = ashr <8 x i16> %src1, %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> @@ -1231,9 +1190,10 @@ define <8 x i8> @sextmask3v8i8(<8 x i16> %src1, <8 x i8> %src2) nounwind { ; CHECK-LABEL: sextmask3v8i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sshr.8h v0, v0, #7 ; CHECK-NEXT: sshll.8h v1, v1, #0 -; CHECK-NEXT: usra.8h v1, v0, #7 -; CHECK-NEXT: shrn.8b v0, v1, #1 +; CHECK-NEXT: shadd.8h v0, v0, v1 +; CHECK-NEXT: xtn.8b v0, v0 ; CHECK-NEXT: ret %sextsrc1 = ashr <8 x i16> %src1, %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> Index: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -543,17 +543,16 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:1 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 +; VI-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:1 ; VI-NEXT: s_endpgm %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 Index: llvm/test/CodeGen/X86/avg.ll =================================================================== --- llvm/test/CodeGen/X86/avg.ll +++ llvm/test/CodeGen/X86/avg.ll @@ -64,15 +64,15 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: pavgb (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgb (%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX-LABEL: avg_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a @@ -162,16 +162,16 @@ ; ; AVX2-LABEL: avg_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper 
; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -313,8 +313,8 @@ ; ; AVX512BW-LABEL: avg_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -361,15 +361,15 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: pavgw (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgw (%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX-LABEL: avg_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a @@ -407,16 +407,16 @@ ; ; AVX2-LABEL: avg_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -489,8 +489,8 @@ ; ; AVX512BW-LABEL: avg_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -561,15 +561,15 @@ ; ; AVX512F-LABEL: avg_v40i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-NEXT: vpavgw 64(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqu %ymm2, (%rax) +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %xmm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2645,8 +2645,8 @@ ; ; AVX-LABEL: PR52131_pavg_chain: ; AVX: # %bb.0: -; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpavgw %xmm0, %xmm2, %xmm0 ; AVX-NEXT: retq %i = zext <8 x i16> %a to <8 x i32> %i1 = zext <8 x i16> %b to <8 x i32> @@ -2665,96 +2665,15 @@ define <8 x i16> @PR52131_pavg_chainlike_but_not_zext(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; SSE2-LABEL: PR52131_pavg_chainlike_but_not_zext: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor 
%xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: psubd %xmm1, %xmm5 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: pslld $15, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $15, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: pavgw %xmm1, %xmm0 +; SSE2-NEXT: pavgw %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX1-LABEL: PR52131_pavg_chainlike_but_not_zext: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: PR52131_pavg_chainlike_but_not_zext: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: PR52131_pavg_chainlike_but_not_zext: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: PR52131_pavg_chainlike_but_not_zext: +; AVX: # %bb.0: +; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpavgw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: retq %i = zext <8 x i16> %a to <8 x i32> %i1 = zext <8 x i16> %b to <8 x i32> %i2 = add nuw nsw <8 x i32> %i, @@ -2787,7 +2706,7 @@ ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -2795,7 +2714,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2803,7 +2722,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpavgw %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %i = and <8 x i32> %a, Index: llvm/test/CodeGen/X86/min-legal-vector-width.ll =================================================================== --- llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -72,8 +72,8 @@ define dso_local void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" { ; CHECK-LABEL: avg_v64i8_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0 -; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpavgb (%rsi), %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq