Index: llvm/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsX86.td
+++ llvm/include/llvm/IR/IntrinsicsX86.td
@@ -513,15 +513,6 @@
 // Misc.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">,
-      Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty,
-                 llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_sse2_packssdw_128 : GCCBuiltin<"__builtin_ia32_packssdw128">,
-      Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty,
-                 llvm_v4i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128">,
-      Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty,
-                 llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_sse2_movmsk_pd : GCCBuiltin<"__builtin_ia32_movmskpd">,
       Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
   def int_x86_sse2_pmovmskb_128 : GCCBuiltin<"__builtin_ia32_pmovmskb128">,
@@ -793,13 +784,6 @@
       [IntrNoMem]>;
 }
 
-// Vector pack
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_packusdw : GCCBuiltin<"__builtin_ia32_packusdw128">,
-      Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
-                [IntrNoMem]>;
-}
-
 // Vector insert
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_insertps : GCCBuiltin<"__builtin_ia32_insertps128">,
@@ -1801,22 +1785,6 @@
                llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
 }
 
-// Pack ops.
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx2_packsswb : GCCBuiltin<"__builtin_ia32_packsswb256">,
-      Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty,
-                 llvm_v16i16_ty], [IntrNoMem]>;
-  def int_x86_avx2_packssdw : GCCBuiltin<"__builtin_ia32_packssdw256">,
-      Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty,
-                 llvm_v8i32_ty], [IntrNoMem]>;
-  def int_x86_avx2_packuswb : GCCBuiltin<"__builtin_ia32_packuswb256">,
-      Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty,
-                 llvm_v16i16_ty], [IntrNoMem]>;
-  def int_x86_avx2_packusdw : GCCBuiltin<"__builtin_ia32_packusdw256">,
-      Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty,
-                 llvm_v8i32_ty], [IntrNoMem]>;
-}
-
 // Horizontal arithmetic ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_phadd_w : GCCBuiltin<"__builtin_ia32_phaddw256">,
@@ -3758,22 +3726,6 @@
                llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
 }
 
-// Pack ops.
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_packsswb_512 : GCCBuiltin<"__builtin_ia32_packsswb512">,
-      Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty],
-                [IntrNoMem]>;
-  def int_x86_avx512_packssdw_512 : GCCBuiltin<"__builtin_ia32_packssdw512">,
-      Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
-                [IntrNoMem]>;
-  def int_x86_avx512_packuswb_512 : GCCBuiltin<"__builtin_ia32_packuswb512">,
-      Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty],
-                [IntrNoMem]>;
-  def int_x86_avx512_packusdw_512 : GCCBuiltin<"__builtin_ia32_packusdw512">,
-      Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
-                [IntrNoMem]>;
-}
-
 // Vector convert
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 def int_x86_avx512_mask_cvtdq2ps_128 : // TODO: remove this intrinsic
Index: llvm/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/lib/IR/AutoUpgrade.cpp
+++ llvm/lib/IR/AutoUpgrade.cpp
@@ -298,7 +298,11 @@
       Name.startswith("avx512.ptestnm") || //Added in 6.0
       Name.startswith("sse2.pavg") || // Added in 6.0
       Name.startswith("avx2.pavg") || // Added in 6.0
-      Name.startswith("avx512.mask.pavg")) // Added in 6.0
+      Name.startswith("avx512.mask.pavg") || // Added in 6.0
+      Name.startswith("sse2.pack") || // Added in 7.0
+      Name.startswith("sse41.pack") || // Added in 7.0
+      Name.startswith("avx2.pack") || // Added in 7.0
+      Name.startswith("avx512.pack")) // Added in 7.0
     return true;
 
   return false;
@@ -1024,6 +1028,45 @@
   return Builder.CreateSExt(Mask, ReturnOp, "vpmovm2");
 }
 
+static Value *EmitX86Pack(IRBuilder<> &Builder, CallInst &CI, bool IsUnsigned,
+                          int EltSize) {
+  Value *A = CI.getArgOperand(0);
+  Value *B = CI.getArgOperand(1);
+
+  Type *Ty = A->getType();
+  APInt MinVal, MaxVal;
+  if (IsUnsigned) {
+    MinVal = APInt::getMinValue(EltSize / 2).zext(EltSize);
+    MaxVal = APInt::getMaxValue(EltSize / 2).zext(EltSize);
+  } else {
+    MinVal = APInt::getSignedMinValue(EltSize / 2).sext(EltSize);
+    MaxVal = APInt::getSignedMaxValue(EltSize / 2).sext(EltSize);
+  }
+
+  SmallVector<uint32_t, 64> ShuffleMask;
+  unsigned NumElts = Ty->getVectorNumElements();
+  unsigned NumLanes = NumElts * Ty->getScalarSizeInBits() / 128;
+  unsigned NumEltsPerLane = 128 / EltSize;
+
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    for (unsigned Elt = 0; Elt != NumEltsPerLane; ++Elt)
+      ShuffleMask.push_back(Elt + (Lane * NumEltsPerLane));
+    for (unsigned Elt = 0; Elt != NumEltsPerLane; ++Elt)
+      ShuffleMask.push_back(Elt + (Lane * NumEltsPerLane) + NumElts);
+  }
+
+  Value *Res = Builder.CreateShuffleVector(A, B, ShuffleMask);
+  Type *RTy = Res->getType();
+  Value *MinVec = ConstantInt::get(RTy, MinVal);
+  Value *MaxVec = ConstantInt::get(RTy, MaxVal);
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_SLT, Res, MaxVec);
+  Res = Builder.CreateSelect(Cmp, Res, MaxVec);
+  Cmp = Builder.CreateICmp(ICmpInst::ICMP_SGT, Res, MinVec);
+  Res = Builder.CreateSelect(Cmp, Res, MinVec);
+  Type *VTy = VectorType::get(Builder.getIntNTy(EltSize / 2), NumElts * 2);
+  return Builder.CreateTrunc(Res, VTy);
+}
+
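For reference, here is what the upgrade emits for a 128-bit signed pack; this is a hand-written sketch of EmitX86Pack's output for llvm.x86.sse2.packsswb.128 (the value names are illustrative, not produced by the patch):

    ; Interleave %a and %b (one 128-bit lane, so this is a plain concatenation),
    ; clamp to the signed i8 range [-128, 127], then truncate each element.
    %s   = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    %lt  = icmp slt <16 x i16> %s, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
    %lo  = select <16 x i1> %lt, <16 x i16> %s, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
    %gt  = icmp sgt <16 x i16> %lo, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
    %sat = select <16 x i1> %gt, <16 x i16> %lo, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
    %res = trunc <16 x i16> %sat to <16 x i8>

For 256-bit and 512-bit types the same interleave is emitted per 128-bit lane, which is what the ShuffleMask loop above computes.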
 // Replace intrinsic with unmasked version and a select.
 static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
                                       CallInst &CI, Value *&Rep) {
@@ -1108,42 +1151,12 @@
       IID = Intrinsic::x86_avx512_pmaddubs_w_512;
     else
       llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("packsswb.")) {
-    if (VecWidth == 128)
-      IID = Intrinsic::x86_sse2_packsswb_128;
-    else if (VecWidth == 256)
-      IID = Intrinsic::x86_avx2_packsswb;
-    else if (VecWidth == 512)
-      IID = Intrinsic::x86_avx512_packsswb_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("packssdw.")) {
-    if (VecWidth == 128)
-      IID = Intrinsic::x86_sse2_packssdw_128;
-    else if (VecWidth == 256)
-      IID = Intrinsic::x86_avx2_packssdw;
-    else if (VecWidth == 512)
-      IID = Intrinsic::x86_avx512_packssdw_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("packuswb.")) {
-    if (VecWidth == 128)
-      IID = Intrinsic::x86_sse2_packuswb_128;
-    else if (VecWidth == 256)
-      IID = Intrinsic::x86_avx2_packuswb;
-    else if (VecWidth == 512)
-      IID = Intrinsic::x86_avx512_packuswb_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("packusdw.")) {
-    if (VecWidth == 128)
-      IID = Intrinsic::x86_sse41_packusdw;
-    else if (VecWidth == 256)
-      IID = Intrinsic::x86_avx2_packusdw;
-    else if (VecWidth == 512)
-      IID = Intrinsic::x86_avx512_packusdw_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("pack")) {
+    bool IsUnsigned = Name[4] == 'u';
+    int EltSize = (Name[6] == 'd') ? 32 : 16;
+    Rep = EmitX86Pack(Builder, CI, IsUnsigned, EltSize);
+    Rep = EmitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2));
+    return true;
   } else if (Name.startswith("vpermilvar.")) {
     if (VecWidth == 128 && EltWidth == 32)
       IID = Intrinsic::x86_avx_vpermilvar_ps;
@@ -2100,6 +2113,21 @@
                          { CI->getArgOperand(0), Builder.getInt1(false) });
     Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                         CI->getArgOperand(1));
+  } else if (IsX86 &&
+             (Name.startswith("sse2.pack") || Name.startswith("sse41.pack") ||
+              Name.startswith("avx2.pack") ||
+              Name.startswith("avx512.pack"))) {
+    int L = Name.size();
+    L = (Name[L - 4] == '.') ? L - 4 : L;
+    bool IsUnsigned = Name[L - 4] == 'u';
+    int EltSize = (Name[L - 2] == 'd') ? 32 : 16;
+    Rep = EmitX86Pack(Builder, *CI, IsUnsigned, EltSize);
+  } else if (IsX86 && Name.startswith("avx512.mask.pack")) {
+    bool IsUnsigned = Name[16] == 'u';
+    int EltSize = (Name[18] == 'd') ? 32 : 16;
+    Rep = EmitX86Pack(Builder, *CI, IsUnsigned, EltSize);
+    Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+                        CI->getArgOperand(2));
   } else if (IsX86 && Name.startswith("avx512.mask.psll")) {
     bool IsImmediate = Name[16] == 'i' ||
                        (Name.size() > 18 && Name[18] == 'i');
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34692,6 +34692,77 @@
   return detectUSatPattern(In, VT);
 }
 
+/// Detect a pattern that shuffles word or dword elements of two vectors
+/// together in 256-bit groups that PACK instructions compress into 128-bit
+/// lanes, and return the original vectors in parameters A and B.
+static bool tracePackVectorShuffle(SDValue SatVal, EVT VT,
+                                   const X86Subtarget &Subtarget,
+                                   bool IsUnsigned, SDValue &A, SDValue &B) {
+  // A 128 bit PACK op receives just a concat of its inputs.
+  if (VT.getSizeInBits() == 128) {
+    if (!Subtarget.hasSSE2())
+      return false;
+    if (SatVal.getOpcode() == ISD::CONCAT_VECTORS &&
+        SatVal.getNumOperands() == 2) {
+      A = SatVal.getOperand(0);
+      B = SatVal.getOperand(1);
+      // Check that this isn't a PACKUSDW pattern without SSE4.1.
+      if (IsUnsigned && A.getValueType().getScalarType() == MVT::i32 &&
+          !Subtarget.hasSSE41())
+        return false;
+      return true;
+    }
+    return false;
+  }
+
+  // Check for AVX2 and AVX512 features.
+  if (VT.getSizeInBits() == 256 && !Subtarget.hasAVX2())
+    return false;
+  if (VT.getSizeInBits() == 512 && !Subtarget.useBWIRegs())
+    return false;
+
+  // Check that the pattern is a shuffle of two vectors, both of which are
+  // the original inputs expanded to the same number of elements as the
+  // output through concatenation.
+  if (SatVal.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+  auto *Shuffle = cast<ShuffleVectorSDNode>(SatVal.getNode());
+  A = Shuffle->getOperand(0);
+  B = Shuffle->getOperand(1);
+  // Cases where A == B get optimized to a distinct unary pattern.
+  bool IsUnary = false;
+  if (B.isUndef()) {
+    IsUnary = true;
+    B = A;
+  }
+  if (A.getOpcode() != ISD::CONCAT_VECTORS || A.getNumOperands() != 2 ||
+      B.getOpcode() != ISD::CONCAT_VECTORS || B.getNumOperands() != 2)
+    return false;
+  // Get the original inputs of the pattern.
+  A = A.getOperand(0);
+  B = B.getOperand(0);
+
+  // Check the shuffle mask. createPackShuffleMask is not used here because it
+  // skips the odd-numbered elements of each lane in each input.
+  SmallVector<int, 64> ShuffleMask;
+  EVT InVT = A.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned Offset = IsUnary ? 0 : VT.getVectorNumElements();
+  unsigned NumLanes = InVT.getSizeInBits() / 128;
+  unsigned NumEltsPerLane = 128 / InVT.getScalarSizeInBits();
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    for (unsigned Elt = 0; Elt != NumEltsPerLane; ++Elt)
+      ShuffleMask.push_back(Elt + (Lane * NumEltsPerLane));
+    for (unsigned Elt = 0; Elt != NumEltsPerLane; ++Elt)
+      ShuffleMask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+  }
+
+  for (unsigned i = 0; i < NumElts; ++i)
+    if (Shuffle->getMaskElt(i) != ShuffleMask[i])
+      return false;
+  return true;
+}
+
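As a worked illustration of the mask check (hand-written; not part of the patch): for a binary v16i16 PACKSSDW the auto-upgrade produces a 16-element shuffle of the two <8 x i32> inputs:

    ; Per-lane interleave of %a and %b; indices 8-15 select elements of %b.
    %s = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>

SelectionDAG widens the half-width operands to CONCAT_VECTORS(%a, undef) and CONCAT_VECTORS(%b, undef), remapping the indices that select the second source from 8..15 to 16..23, so the VECTOR_SHUFFLE mask seen by the code above is <0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23> with Offset == 16 and two 128-bit lanes, which is exactly what the loop rebuilds.

 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
@@ -34699,6 +34770,28 @@
   EVT InVT = In.getValueType();
   EVT InSVT = InVT.getScalarType();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // A special case for a full PACK pattern, detecting the vector concatenation
+  // (if 128 bit) or lane shuffle (other cases).
+  // FIXME: the resulting pattern is basically the same as the others (a
+  // truncation of a saturation), but the need to generate VTRUNC over more
+  // generic patterns in truncateVectorWithPACK means that we replicate some
+  // checks here.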
+  if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
+      ((SVT == MVT::i8 && InSVT == MVT::i16) ||
+       (SVT == MVT::i16 && InSVT == MVT::i32))) {
+    unsigned Opcode = 0;
+    SDValue SatVal;
+    if (auto SSatVal = detectSSatPattern(In, VT)) {
+      Opcode = X86ISD::PACKSS;
+      SatVal = SSatVal;
+    } else if (auto USatVal = detectSSatPattern(In, VT, true)) {
+      Opcode = X86ISD::PACKUS;
+      SatVal = USatVal;
+    }
+    SDValue A, B;
+    if (Opcode && tracePackVectorShuffle(SatVal, VT, Subtarget,
+                                         (Opcode == X86ISD::PACKUS), A, B))
+      return DAG.getNode(Opcode, DL, VT, A, B);
+  }
   if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
       isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
     if (auto SSatVal = detectSSatPattern(In, VT))
Index: llvm/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -398,10 +398,6 @@
   X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
-  X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
@@ -1438,10 +1434,6 @@
   X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
-  X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx512_pmaddubs_w_512, INTR_TYPE_2OP,
                      X86ISD::VPMADDUBSW, 0),
   X86_INTRINSIC_DATA(avx512_pmaddw_d_512, INTR_TYPE_2OP,
@@ -1599,9 +1591,6 @@
   X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
   X86_INTRINSIC_DATA(sse2_min_sd, INTR_TYPE_2OP, X86ISD::FMINS, 0),
   X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
-  X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
@@ -1645,7 +1634,6 @@
   X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
   X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
   X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
-  X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
   X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
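End to end, generic IR in this shape should now reach the pack instructions through the combine above instead of through the intrinsic table entries removed here. A hand-written sketch for the unsigned 128-bit case (function and value names are illustrative; on an SSE2 target this is expected to select packuswb):

    define <16 x i8> @packuswb_sketch(<8 x i16> %a, <8 x i16> %b) {
      ; Concatenate, clamp to the unsigned i8 range [0, 255], then truncate.
      %c  = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
      %lt = icmp slt <16 x i16> %c, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
      %lo = select <16 x i1> %lt, <16 x i16> %c, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
      %gt = icmp sgt <16 x i16> %lo, zeroinitializer
      %hi = select <16 x i1> %gt, <16 x i16> %lo, <16 x i16> zeroinitializer
      %r  = trunc <16 x i16> %hi to <16 x i8>
      ret <16 x i8> %r
    }

Index: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp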
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -566,81 +566,6 @@
   return Builder.CreateAShr(Vec, ShiftVec);
 }
 
-static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
-  Value *Arg0 = II.getArgOperand(0);
-  Value *Arg1 = II.getArgOperand(1);
-  Type *ResTy = II.getType();
-
-  // Fast all undef handling.
-  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
-    return UndefValue::get(ResTy);
-
-  Type *ArgTy = Arg0->getType();
-  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
-  unsigned NumDstElts = ResTy->getVectorNumElements();
-  unsigned NumSrcElts = ArgTy->getVectorNumElements();
-  assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types");
-
-  unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
-  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
-  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
-  assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) &&
-         "Unexpected packing types");
-
-  // Constant folding.
-  auto *Cst0 = dyn_cast<Constant>(Arg0);
-  auto *Cst1 = dyn_cast<Constant>(Arg1);
-  if (!Cst0 || !Cst1)
-    return nullptr;
-
-  SmallVector<Constant *, 32> Vals;
-  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-    for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
-      unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
-      auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0;
-      auto *COp = Cst->getAggregateElement(SrcIdx);
-      if (COp && isa<UndefValue>(COp)) {
-        Vals.push_back(UndefValue::get(ResTy->getScalarType()));
-        continue;
-      }
-
-      auto *CInt = dyn_cast_or_null<ConstantInt>(COp);
-      if (!CInt)
-        return nullptr;
-
-      APInt Val = CInt->getValue();
-      assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() &&
-             "Unexpected constant bitwidth");
-
-      if (IsSigned) {
-        // PACKSS: Truncate signed value with signed saturation.
-        // Source values less than dst minint are saturated to minint.
-        // Source values greater than dst maxint are saturated to maxint.
-        if (Val.isSignedIntN(DstScalarSizeInBits))
-          Val = Val.trunc(DstScalarSizeInBits);
-        else if (Val.isNegative())
-          Val = APInt::getSignedMinValue(DstScalarSizeInBits);
-        else
-          Val = APInt::getSignedMaxValue(DstScalarSizeInBits);
-      } else {
-        // PACKUS: Truncate signed value with unsigned saturation.
-        // Source values less than zero are saturated to zero.
-        // Source values greater than dst maxuint are saturated to maxuint.
-        if (Val.isIntN(DstScalarSizeInBits))
-          Val = Val.trunc(DstScalarSizeInBits);
-        else if (Val.isNegative())
-          Val = APInt::getNullValue(DstScalarSizeInBits);
-        else
-          Val = APInt::getAllOnesValue(DstScalarSizeInBits);
-      }
-
-      Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val));
-    }
-  }
-
-  return ConstantVector::get(Vals);
-}
-
 static Value *simplifyX86movmsk(const IntrinsicInst &II) {
   Value *Arg = II.getArgOperand(0);
   Type *ResTy = II.getType();
@@ -2593,26 +2518,6 @@
       return replaceInstUsesWith(*II, V);
     break;
 
-  case Intrinsic::x86_sse2_packssdw_128:
-  case Intrinsic::x86_sse2_packsswb_128:
-  case Intrinsic::x86_avx2_packssdw:
-  case Intrinsic::x86_avx2_packsswb:
-  case Intrinsic::x86_avx512_packssdw_512:
-  case Intrinsic::x86_avx512_packsswb_512:
-    if (Value *V = simplifyX86pack(*II, true))
-      return replaceInstUsesWith(*II, V);
-    break;
-
-  case Intrinsic::x86_sse2_packuswb_128:
-  case Intrinsic::x86_sse41_packusdw:
-  case Intrinsic::x86_avx2_packusdw:
-  case Intrinsic::x86_avx2_packuswb:
-  case Intrinsic::x86_avx512_packusdw_512:
-  case Intrinsic::x86_avx512_packuswb_512:
-    if (Value *V = simplifyX86pack(*II, false))
-      return replaceInstUsesWith(*II, V);
-    break;
-
   case Intrinsic::x86_pclmulqdq: {
     if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
       unsigned Imm = C->getZExtValue();
Index: llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1436,63 +1436,6 @@
     break;
 
-  case Intrinsic::x86_sse2_packssdw_128:
-  case Intrinsic::x86_sse2_packsswb_128:
-  case Intrinsic::x86_sse2_packuswb_128:
-  case Intrinsic::x86_sse41_packusdw:
-  case Intrinsic::x86_avx2_packssdw:
-  case Intrinsic::x86_avx2_packsswb:
-  case Intrinsic::x86_avx2_packusdw:
-  case Intrinsic::x86_avx2_packuswb:
-  case Intrinsic::x86_avx512_packssdw_512:
-  case Intrinsic::x86_avx512_packsswb_512:
-  case Intrinsic::x86_avx512_packusdw_512:
-  case Intrinsic::x86_avx512_packuswb_512: {
-    auto *Ty0 = II->getArgOperand(0)->getType();
-    unsigned InnerVWidth = Ty0->getVectorNumElements();
-    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
-
-    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
-    unsigned VWidthPerLane = VWidth / NumLanes;
-    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
-
-    // Per lane, pack the elements of the first input and then the second.
-    // e.g.
-    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
-    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
-    for (int OpNum = 0; OpNum != 2; ++OpNum) {
-      APInt OpDemandedElts(InnerVWidth, 0);
-      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-        unsigned LaneIdx = Lane * VWidthPerLane;
-        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
-          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
-          if (DemandedElts[Idx])
-            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
-        }
-      }
-
-      // Demand elements from the operand.
-      auto *Op = II->getArgOperand(OpNum);
-      APInt OpUndefElts(InnerVWidth, 0);
-      TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts,
-                                        Depth + 1);
-      if (TmpV) {
-        II->setArgOperand(OpNum, TmpV);
-        MadeChange = true;
-      }
-
-      // Pack the operand's UNDEF elements, one lane at a time.
-      OpUndefElts = OpUndefElts.zext(VWidth);
-      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
-        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
-        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
-        UndefElts |= LaneElts;
-      }
-    }
-    break;
-  }
-
   // PSHUFB
   case Intrinsic::x86_ssse3_pshuf_b_128:
   case Intrinsic::x86_avx2_pshuf_b:
Index: llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
===================================================================
--- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2374,22 +2374,6 @@
   // intrinsic.
   Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
     switch (id) {
-      case Intrinsic::x86_sse2_packsswb_128:
-      case Intrinsic::x86_sse2_packuswb_128:
-        return Intrinsic::x86_sse2_packsswb_128;
-
-      case Intrinsic::x86_sse2_packssdw_128:
-      case Intrinsic::x86_sse41_packusdw:
-        return Intrinsic::x86_sse2_packssdw_128;
-
-      case Intrinsic::x86_avx2_packsswb:
-      case Intrinsic::x86_avx2_packuswb:
-        return Intrinsic::x86_avx2_packsswb;
-
-      case Intrinsic::x86_avx2_packssdw:
-      case Intrinsic::x86_avx2_packusdw:
-        return Intrinsic::x86_avx2_packssdw;
-
       case Intrinsic::x86_mmx_packsswb:
       case Intrinsic::x86_mmx_packuswb:
         return Intrinsic::x86_mmx_packsswb;
@@ -2674,17 +2658,6 @@
       handleVectorShiftIntrinsic(I, /* Variable */ true);
       break;
 
-    case Intrinsic::x86_sse2_packsswb_128:
-    case Intrinsic::x86_sse2_packssdw_128:
-    case Intrinsic::x86_sse2_packuswb_128:
-    case Intrinsic::x86_sse41_packusdw:
-    case Intrinsic::x86_avx2_packsswb:
-    case Intrinsic::x86_avx2_packssdw:
-    case Intrinsic::x86_avx2_packuswb:
-    case Intrinsic::x86_avx2_packusdw:
-      handleVectorPackIntrinsic(I);
-      break;
-
     case Intrinsic::x86_mmx_packsswb:
     case Intrinsic::x86_mmx_packuswb:
       handleVectorPackIntrinsic(I, 16);
Index: llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
===================================================================
--- llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1936,11 +1936,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
+  %1 = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %2 = icmp slt <32 x i16> %1, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %4 = icmp sgt <32 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %call = trunc <32 x i16> %5 to <32 x i8>
   %res = bitcast <32 x i8> %call to <4 x i64>
   ret <4 x i64> %res
 }
-declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_packs_epi32:
@@ -1949,11 +1953,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
+  %1 = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %2 = icmp slt <16 x i32> %1, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %4 = icmp sgt <16 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %call = trunc <16 x i32> %5 to <16 x i16>
   %res = bitcast <16 x i16> %call to <4 x i64>
   ret <4 x i64> %res
 }
-declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_packus_epi16:
@@ -1962,11 +1970,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
+  %1 = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %2 = icmp slt <32 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %4 = icmp sgt <32 x i16> %3, zeroinitializer
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
+  %call = trunc <32 x i16> %5 to <32 x i8>
   %res = bitcast <32 x i8> %call to <4 x i64>
   ret <4 x i64> %res
 }
-declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_packus_epi32:
@@ -1975,11 +1987,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
+  %1 = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %2 = icmp slt <16 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %4 = icmp sgt <16 x i32> %3, zeroinitializer
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
+  %call = trunc <16 x i32> %5 to <16 x i16>
   %res = bitcast <16 x i16> %call to <4 x i64>
   ret <4 x i64> %res
 }
-declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_permute2x128_si256:
Index: llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
===================================================================
--- llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -4,6 +4,190 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=X64 --check-prefix=X64-AVX512
 
+define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
+; X86-LABEL: test_x86_avx2_packssdw:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packssdw:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_packssdw_unary(<8 x i32> %a) {
+; X86-LABEL: test_x86_avx2_packssdw_unary:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpackssdw %ymm0, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packssdw_unary:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpackssdw %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %a) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+
+
+define <16 x i16> @test_x86_avx2_packssdw_fold() {
+; X86-LABEL: test_x86_avx2_packssdw_fold:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packssdw_fold:
+; X64: ## %bb.0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] +; X64-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> ) + ret <16 x i16> %res +} + + +define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) { +; X86-LABEL: test_x86_avx2_packsswb: +; X86: ## %bb.0: +; X86-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_x86_avx2_packsswb: +; X64: ## %bb.0: +; X64-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq + %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_packsswb_unary(<16 x i16> %a) { +; X86-LABEL: test_x86_avx2_packsswb_unary: +; X86: ## %bb.0: +; X86-NEXT: vpacksswb %ymm0, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_x86_avx2_packsswb_unary: +; X64: ## %bb.0: +; X64-NEXT: vpacksswb %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq + %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %a) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +define <32 x i8> @test_x86_avx2_packsswb_fold() { +; X86-LABEL: test_x86_avx2_packsswb_fold: +; X86: ## %bb.0: +; X86-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-NEXT: retl +; +; X64-LABEL: test_x86_avx2_packsswb_fold: +; X64: ## %bb.0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-NEXT: retq + %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> zeroinitializer) + ret <32 x i8> %res +} + + +define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) { +; X86-LABEL: test_x86_avx2_packuswb: +; X86: ## %bb.0: +; X86-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_x86_avx2_packuswb: +; X64: ## %bb.0: +; X64-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq + %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_packuswb_unary(<16 x i16> %a) { +; X86-LABEL: test_x86_avx2_packuswb_unary: +; X86: ## %bb.0: +; X86-NEXT: vpackuswb %ymm0, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_x86_avx2_packuswb_unary: +; X64: ## %bb.0: +; X64-NEXT: vpackuswb %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq + %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %a) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +define <32 x i8> @test_x86_avx2_packuswb_fold() { +; X86-LABEL: test_x86_avx2_packuswb_fold: +; X86: ## %bb.0: +; X86-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-NEXT: retl +; +; X64-LABEL: test_x86_avx2_packuswb_fold: +; X64: ## %bb.0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-NEXT: retq + %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> , <16 x i16> zeroinitializer) + ret <32 x i8> %res +} + + +define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) { +; X86-LABEL: test_x86_avx2_packusdw: +; X86: ## %bb.0: +; X86-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: 
test_x86_avx2_packusdw: +; X64: ## %bb.0: +; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_packusdw_unary(<8 x i32> %a) { +; X86-LABEL: test_x86_avx2_packusdw_unary: +; X86: ## %bb.0: +; X86-NEXT: vpackusdw %ymm0, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_x86_avx2_packusdw_unary: +; X64: ## %bb.0: +; X64-NEXT: vpackusdw %ymm0, %ymm0, %ymm0 +; X64-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %a) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} + + +define <16 x i16> @test_x86_avx2_packusdw_fold() { +; X86-LABEL: test_x86_avx2_packusdw_fold: +; X86: ## %bb.0: +; X86-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X86-NEXT: retl +; +; X64-LABEL: test_x86_avx2_packusdw_fold: +; X64: ## %bb.0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X64-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> ) + ret <16 x i16> %res +} + + define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { ; X86-LABEL: test_x86_avx2_pblendw: ; X86: ## %bb.0: Index: llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -4,183 +4,6 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL -define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_packssdw: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packssdw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packssdw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packssdw: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_packssdw_fold() { -; X86-AVX-LABEL: test_x86_avx2_packssdw_fold: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI1_0, kind: FK_Data_4 -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; 
X86-AVX512VL-LABEL: test_x86_avx2_packssdw_fold: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovaps LCPI1_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI1_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packssdw_fold: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI1_0-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packssdw_fold: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI1_0-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> ) - ret <16 x i16> %res -} - - -define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_packsswb: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packsswb: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packsswb: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packsswb: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone - - -define <32 x i8> @test_x86_avx2_packsswb_fold() { -; X86-AVX-LABEL: test_x86_avx2_packsswb_fold: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI3_0, kind: FK_Data_4 -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovaps LCPI3_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI3_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packsswb_fold: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: ## 
encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> zeroinitializer) - ret <32 x i8> %res -} - - -define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_packuswb: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packuswb: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packuswb: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packuswb: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone - - -define <32 x i8> @test_x86_avx2_packuswb_fold() { -; X86-AVX-LABEL: test_x86_avx2_packuswb_fold: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4 -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovaps LCPI5_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packuswb_fold: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x 
i8> @llvm.x86.avx2.packuswb(<16 x i16> , <16 x i16> zeroinitializer) - ret <32 x i8> %res -} - - define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) { ; X86-AVX-LABEL: test_x86_avx2_padds_b: ; X86-AVX: ## %bb.0: @@ -1299,65 +1122,6 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone -define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_packusdw: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packusdw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packusdw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packusdw: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_packusdw_fold() { -; X86-AVX-LABEL: test_x86_avx2_packusdw_fold: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4 -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovaps LCPI54_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packusdw_fold: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> ) - ret <16 x i16> %res -} - - define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; X86-LABEL: test_x86_avx2_pblendvb: ; X86: ## %bb.0: @@ -2071,36 +1835,36 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X86-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4 -; X86-AVX-NEXT: 
vpsravd LCPI86_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI78_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI78_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI78_1, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI86_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] +; X86-AVX512VL-NEXT: vmovdqa LCPI78_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI86_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI78_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI78_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI78_1, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X64-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI78_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI78_1-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI78_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI78_1-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> ) ret <4 x i32> %res @@ -2136,36 +1900,36 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 -; X86-AVX-NEXT: vpsravd LCPI88_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI80_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI80_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## 
fixup A - offset: 5, value: LCPI80_1, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI88_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-AVX512VL-NEXT: vmovdqa LCPI80_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI88_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI80_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI80_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI80_1, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI80_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI80_1-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI80_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI80_1-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> ) ret <8 x i32> %res Index: llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1137,6 +1137,640 @@ %res = call <32 x i16> @llvm.x86.avx512.cvtmask2w.512(i32 %x0) ret <32 x i16> %res } + + +define <32 x i16> @test_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) { +; AVX512BW-LABEL: test_packs_epi32_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_packs_epi32_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) + ret <32 x i16> %1 +} + +define <32 x 
i16> @test_packs_epi32_rr_512_unary(<16 x i32> %a) {
+; AVX512BW-LABEL: test_packs_epi32_rr_512_unary:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackssdw %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rr_512_unary:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpackssdw %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %a)
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rrk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rrk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rrkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rrkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
+; AVX512BW-LABEL: test_packs_epi32_rm_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rm_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rmk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rmk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rmkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rmkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
+; AVX512BW-LABEL: test_packs_epi32_rmb_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rmb_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+ ret <32 x i16> %3
+}
+
+declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>)
+
+define <64 x i8> @test_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_packs_epi16_rr_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rr_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packs_epi16_rr_512_unary(<32 x i16> %a) {
+; AVX512BW-LABEL: test_packs_epi16_rr_512_unary:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpacksswb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rr_512_unary:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpacksswb %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %a)
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
+; AVX512BW-LABEL: test_packs_epi16_rrkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rrkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_packs_epi16_rm_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rm_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %b = load <32 x i16>, <32 x i16>* %ptr_b
+ %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_packs_epi16_rmk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rmk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %b = load <32 x i16>, <32 x i16>* %ptr_b
+ %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
+; AVX512BW-LABEL: test_packs_epi16_rmkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rmkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %b = load <32 x i16>, <32 x i16>* %ptr_b
+ %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+ ret <64 x i8> %3
+}
+
+declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>)
+
+define <32 x i16> @test_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
+; AVX512BW-LABEL: test_packus_epi32_rr_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rr_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packus_epi32_rr_512_unary(<16 x i32> %a) {
+; AVX512BW-LABEL: test_packus_epi32_rr_512_unary:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackusdw %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rr_512_unary:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpackusdw %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %a)
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rrk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rrk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rrkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rrkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
+; AVX512BW-LABEL: test_packus_epi32_rm_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rm_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rmk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rmkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %b = load <16 x i32>, <16 x i32>* %ptr_b
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
+; AVX512BW-LABEL: test_packus_epi32_rmb_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmb_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rmbk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmbk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rmbkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmbkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+ ret <32 x i16> %3
+}
+
+declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>)
+
+define <64 x i8> @test_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_packus_epi16_rr_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rr_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packus_epi16_rr_512_unary(<32 x i16> %a) {
+; AVX512BW-LABEL: test_packus_epi16_rr_512_unary:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackuswb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rr_512_unary:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpackuswb %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %a)
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_packus_epi16_rrk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rrk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
+; AVX512BW-LABEL: test_packus_epi16_rrkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rrkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_packus_epi16_rm_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rm_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %b = load <32 x i16>, <32 x i16>* %ptr_b
+ %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_packus_epi16_rmk_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rmk_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %b = load <32 x i16>, <32 x i16>* %ptr_b
+ %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
+; AVX512BW-LABEL: test_packus_epi16_rmkz_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rmkz_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: retl
+ %b = load <32 x i16>, <32 x i16>* %ptr_b
+ %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+ ret <64 x i8> %3
+}
+
+declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>)
+
 define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
 ; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
 ; AVX512BW: ## %bb.0:
@@ -1151,6 +1785,20 @@
 ret <32 x i16> %res
 }
 
+define <32 x i16> @test_mask_packs_epi32_rr_512_unary(<16 x i32> %a) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rr_512_unary:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackssdw %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512_unary:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpackssdw %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %a, <32 x i16> zeroinitializer, i32 -1)
+ ret <32 x i16> %res
+}
+
 define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
 ; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
 ; AVX512BW: ## %bb.0:
@@ -1257,15 +1905,15 @@
 ret <32 x i16> %res
 }
 
-define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
+define <32 x i16> @test_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rmbk_512:
 ; AVX512BW: ## %bb.0:
 ; AVX512BW-NEXT: kmovd %esi, %k1
 ; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT: retq
 ;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
+; AVX512F-32-LABEL: test_packs_epi32_rmbk_512:
 ; AVX512F-32: # %bb.0:
 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
@@ -1279,14 +1927,14 @@
 ret <32 x i16> %res
 }
 
-define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
+define <32 x i16> @test_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rmbkz_512:
 ; AVX512BW: ## %bb.0:
 ; AVX512BW-NEXT: kmovd %esi, %k1
 ; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: retq
 ;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
+; AVX512F-32-LABEL: test_packs_epi32_rmbkz_512:
 ; AVX512F-32: # %bb.0:
 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
@@ -1315,15 +1963,29 @@
 ret <64 x i8> %res
 }
 
-define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
+define <64 x i8> @test_mask_packs_epi16_rr_512_unary(<32 x i16> %a) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rr_512_unary:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpacksswb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512_unary:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpacksswb %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %a, <64 x i8> zeroinitializer, i64 -1)
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_packs_epi16_rrk_512:
 ; AVX512BW: ## %bb.0:
 ; AVX512BW-NEXT: kmovq %rdi, %k1
 ; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
+; AVX512F-32-LABEL: test_packs_epi16_rrk_512:
 ; AVX512F-32: # %bb.0:
 ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
@@ -1420,6 +2082,20 @@
 ret <32 x i16> %res
 }
 
+define <32 x i16> @test_mask_packus_epi32_rr_512_unary(<16 x i32> %a) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rr_512_unary:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackusdw %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512_unary:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpackusdw %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %a, <32 x i16> zeroinitializer, i32 -1)
+ ret <32 x i16> %res
+}
+
 define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
 ; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
 ; AVX512BW: ## %bb.0:
@@ -1584,6 +2260,20 @@
 ret <64 x i8> %res
 }
 
+define <64 x i8> @test_mask_packus_epi16_rr_512_unary(<32 x i16> %a) {
+; AVX512BW-LABEL: test_mask_packus_epi16_rr_512_unary:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpackuswb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512_unary:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpackuswb %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %a, <64 x i8> zeroinitializer, i64 -1)
+ ret <64 x i8> %res
+}
+
 define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
 ; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
 ; AVX512BW: ## %bb.0:
Index: llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2,583 +2,6 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
 
-define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: retl
- %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
- ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
- %2 = bitcast i32 %mask to <32 x i1>
- %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
- ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: retl
- %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
- %2 = bitcast i32 %mask to <32 x i1>
- %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
- ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <16 x i32>, <16 x i32>* %ptr_b - %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) - ret <32 x i16> %1 -} - -define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <16 x i32>, <16 x i32>* %ptr_b - %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru - ret <32 x i16> %3 -} - -define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi32_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <16 x i32>, <16 x i32>* %ptr_b - %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer - ret <32 x i16> %3 -} - -define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { -; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi32_rmb_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) - ret <32 x i16> %1 -} - -define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %1 = call <32 x i16> 
@llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru - ret <32 x i16> %3 -} - -define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer - ret <32 x i16> %3 -} - -declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) - -define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_packs_epi16_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) - ret <64 x i8> %1 -} - -define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) { -; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovq %rdi, %k1 -; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru - ret <64 x i8> %3 -} - -define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) { -; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovq %rdi, %k1 -; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer - ret <64 x i8> %3 -} - -define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_packs_epi16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), 
%eax -; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) - ret <64 x i8> %1 -} - -define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) { -; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovq %rsi, %k1 -; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru - ret <64 x i8> %3 -} - -define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) { -; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovq %rsi, %k1 -; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer - ret <64 x i8> %3 -} - -declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) - - -define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) { -; AVX512BW-LABEL: test_mask_packus_epi32_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) - ret <32 x i16> %1 -} - -define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru - ret <32 x i16> %3 -} - -define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} 
{z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi32_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer - ret <32 x i16> %3 -} - -define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) { -; AVX512BW-LABEL: test_mask_packus_epi32_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi32_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <16 x i32>, <16 x i32>* %ptr_b - %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) - ret <32 x i16> %1 -} - -define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <16 x i32>, <16 x i32>* %ptr_b - %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru - ret <32 x i16> %3 -} - -define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi32_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <16 x i32>, <16 x i32>* %ptr_b - %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer - ret <32 x i16> %3 -} - -define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { -; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi32_rmb_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) - ret <32 x i16> %1 -} - -define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> 
%a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru - ret <32 x i16> %3 -} - -define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi32_rmbkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer - ret <32 x i16> %3 -} - -declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) - -define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_packus_epi16_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) - ret <64 x i8> %1 -} - -define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovq %rdi, %k1 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru - ret <64 x i8> %3 -} - -define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovq %rdi, %k1 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 
{%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer - ret <64 x i8> %3 -} - -define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_packus_epi16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) - ret <64 x i8> %1 -} - -define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovq %rsi, %k1 -; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru - ret <64 x i8> %3 -} - -define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) { -; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovq %rsi, %k1 -; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer - ret <64 x i8> %3 -} - -declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) - define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: test_mask_adds_epi16_rr_512: ; AVX512BW: ## %bb.0: Index: llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -2109,6 +2109,836 @@ %res = call <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16 %x0) ret <16 x i16> %res } + +define <8 x i16> @test_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_packs_epi32_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0x6b,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_packs_epi32_rr_128_unary(<4 x i32> %a) { +; CHECK-LABEL: test_packs_epi32_rr_128_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %a) + ret <8 x i16> %1 +} + +define <8 x i16> @test_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_packs_epi32_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_packs_epi32_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +define <8 x i16> @test_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { +; CHECK-LABEL: test_packs_epi32_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_packs_epi32_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_packs_epi32_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +define <8 x i16> @test_packs_epi32_rmb_128(<4 
x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_packs_epi32_rmb_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_packs_epi32_rmbk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_packs_epi32_rmbkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) + +define <16 x i16> @test_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: test_packs_epi32_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_packs_epi32_rr_256_unary(<8 x i32> %a) { +; CHECK-LABEL: test_packs_epi32_rr_256_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %a) + ret <16 x i16> %1 +} + +define <16 x i16> @test_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_packs_epi32_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> 
%passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_packs_epi32_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { +; CHECK-LABEL: test_packs_epi32_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_packs_epi32_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_packs_epi32_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_packs_epi32_rmb_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_packs_epi32_rmbk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 
= call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_packs_epi32_rmbkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) + +define <16 x i8> @test_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_packs_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_packs_epi16_rr_128_unary(<8 x i16> %a) { +; CHECK-LABEL: test_packs_epi16_rr_128_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %a) + ret <16 x i8> %1 +} + +define <16 x i8> @test_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_packs_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_packs_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +define <16 x i8> @test_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_packs_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: 
test_packs_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_packs_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) + +define <32 x i8> @test_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_packs_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + ret <32 x i8> %1 +} + +define <32 x i8> @test_packs_epi16_rr_256_unary(<16 x i16> %a) { +; CHECK-LABEL: test_packs_epi16_rr_256_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpacksswb %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %a) + ret <32 x i8> %1 +} + +define <32 x i8> @test_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_packs_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_packs_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +define <32 x i8> @test_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_packs_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load 
<16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + ret <32 x i8> %1 +} + +define <32 x i8> @test_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_packs_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_packs_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) + + +define <8 x i16> @test_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_packus_epi32_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_packus_epi32_rr_128_unary(<4 x i32> %a) { +; CHECK-LABEL: test_packus_epi32_rr_128_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %a) + ret <8 x i16> %1 +} + +define <8 x i16> @test_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_packus_epi32_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_packus_epi32_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} +
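; The masked variants above need no dedicated masked intrinsic: the scalar
; mask is bitcast to a vector of i1 and applied with a plain select, which
; instruction selection folds into the {%k1} / {%k1} {z} forms checked for in
; these tests. A minimal standalone sketch of the pattern (hypothetical
; function name, intrinsic declared inline for self-containment):
define <8 x i16> @packus_masked_sketch(<4 x i32> %a, <4 x i32> %b, <8 x i16> %src, i8 %m) {
  %p = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
  ; lanes where %m is set take the pack result, others keep %src
  %k = bitcast i8 %m to <8 x i1>
  %r = select <8 x i1> %k, <8 x i16> %p, <8 x i16> %src
  ret <8 x i16> %r
}
; Passing zeroinitializer as %src gives the zero-masked ({z}) flavour.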
+define <8 x i16> @test_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { +; CHECK-LABEL: test_packus_epi32_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_packus_epi32_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_packus_epi32_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <4 x i32>, <4 x i32>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +define <8 x i16> @test_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_packus_epi32_rmb_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %1 +} + +define <8 x i16> @test_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_packus_epi32_rmbk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_packus_epi32_rmbkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <4 x i32> 
undef, i32 %q, i32 0 + %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) + +define <16 x i16> @test_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: test_packus_epi32_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_packus_epi32_rr_256_unary(<8 x i32> %a) { +; CHECK-LABEL: test_packus_epi32_rr_256_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %a) + ret <16 x i16> %1 +} + +define <16 x i16> @test_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_packus_epi32_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_packus_epi32_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { +; CHECK-LABEL: test_packus_epi32_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_packus_epi32_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} +
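; The rmb/rmbk/rmbkz variants in this file feed one source from a scalar load
; that is splatted with insertelement + shufflevector; codegen is expected to
; fold that splat into an EVEX embedded-broadcast memory operand such as
; (%rdi){1to4} or (%rdi){1to8}. The splat idiom in isolation, as a sketch
; (hypothetical function name):
define <4 x i32> @splat_i32_sketch(i32* %p) {
  ; load one i32 and broadcast it to all four lanes
  %q = load i32, i32* %p
  %v = insertelement <4 x i32> undef, i32 %q, i32 0
  %s = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %s
}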
+define <16 x i16> @test_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_packus_epi32_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i32>, <8 x i32>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { +; CHECK-LABEL: test_packus_epi32_rmb_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_packus_epi32_rmbk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_packus_epi32_rmbkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %q = load i32, i32* %ptr_b + %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 + %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) + +define <16 x i8> @test_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_packus_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_packus_epi16_rr_128_unary(<8 x i16> %a) { +; CHECK-LABEL: test_packus_epi16_rr_128_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %a) + 
ret <16 x i8> %1 +} + +define <16 x i8> @test_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_packus_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_packus_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +define <16 x i8> @test_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_packus_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_packus_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_packus_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) + +define <32 x i8> @test_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_packus_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + ret <32 x i8> %1 +} + +define <32 x i8> @test_packus_epi16_rr_256_unary(<16 x i16> %a) { +; CHECK-LABEL: 
test_packus_epi16_rr_256_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackuswb %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %a) + ret <32 x i8> %1 +} + +define <32 x i8> @test_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_packus_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_packus_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +define <32 x i8> @test_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_packus_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + ret <32 x i8> %1 +} + +define <32 x i8> @test_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_packus_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_packus_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) + define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_packs_epi32_rr_128: ; CHECK: ## %bb.0: @@ -2118,6 +2948,15 @@ ret <8 x i16> %res } 
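; The _unary tests added in the hunks below call the legacy avx512.mask.*
; intrinsics with both sources tied to the same value and an all-ones mask
; (e.g. i8 -1), so the mask select is a no-op and the plain VEX-encoded
; instruction is expected. Sketch of the call shape these tests use
; (hypothetical function name; the four-operand signature matches the tests):
define <8 x i16> @mask_packssdw_allones_sketch(<4 x i32> %a) {
  ; passthrough is irrelevant because the -1 mask selects every result lane
  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %a, <8 x i16> zeroinitializer, i8 -1)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)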
+define <8 x i16> @test_mask_packs_epi32_rr_128_unary(<4 x i32> %a) { +; CHECK-LABEL: test_mask_packs_epi32_rr_128_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %a, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) { ; CHECK-LABEL: test_mask_packs_epi32_rrk_128: ; CHECK: ## %bb.0: @@ -2222,6 +3061,15 @@ ret <16 x i16> %res } +define <16 x i16> @test_mask_packs_epi32_rr_256_unary(<8 x i32> %a) { +; CHECK-LABEL: test_mask_packs_epi32_rr_256_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackssdw %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %a, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) { ; CHECK-LABEL: test_mask_packs_epi32_rrk_256: ; CHECK: ## %bb.0: @@ -2326,6 +3174,15 @@ ret <16 x i8> %res } +define <16 x i8> @test_mask_packs_epi16_rr_128_unary(<8 x i16> %a) { +; CHECK-LABEL: test_mask_packs_epi16_rr_128_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %a, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) { ; CHECK-LABEL: test_mask_packs_epi16_rrk_128: ; CHECK: ## %bb.0: @@ -2391,6 +3248,15 @@ ret <32 x i8> %res } +define <32 x i8> @test_mask_packs_epi16_rr_256_unary(<16 x i16> %a) { +; CHECK-LABEL: test_mask_packs_epi16_rr_256_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpacksswb %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %a, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) { ; CHECK-LABEL: test_mask_packs_epi16_rrk_256: ; CHECK: ## %bb.0: @@ -2457,6 +3323,15 @@ ret <8 x i16> %res } +define <8 x i16> @test_mask_packus_epi32_rr_128_unary(<4 x i32> %a) { +; CHECK-LABEL: test_mask_packus_epi32_rr_128_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %a, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) { ; CHECK-LABEL: test_mask_packus_epi32_rrk_128: ; CHECK: ## %bb.0: @@ -2561,6 +3436,15 @@ ret <16 x i16> %res } +define <16 x i16> @test_mask_packus_epi32_rr_256_unary(<8 x i32> %a) { +; CHECK-LABEL: test_mask_packus_epi32_rr_256_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackusdw %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc0] +; CHECK-NEXT: retq ## 
encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %a, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) { ; CHECK-LABEL: test_mask_packus_epi32_rrk_256: ; CHECK: ## %bb.0: @@ -2665,6 +3549,15 @@ ret <16 x i8> %res } +define <16 x i8> @test_mask_packus_epi16_rr_128_unary(<8 x i16> %a) { +; CHECK-LABEL: test_mask_packus_epi16_rr_128_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %a, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) { ; CHECK-LABEL: test_mask_packus_epi16_rrk_128: ; CHECK: ## %bb.0: @@ -2730,6 +3623,15 @@ ret <32 x i8> %res } +define <32 x i8> @test_mask_packus_epi16_rr_256_unary(<16 x i16> %a) { +; CHECK-LABEL: test_mask_packus_epi16_rr_256_unary: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpackuswb %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %a, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) { ; CHECK-LABEL: test_mask_packus_epi16_rrk_256: ; CHECK: ## %bb.0: 
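; With the new-IR forms of these tests in place, the copies kept in
; avx512bwvl-intrinsics.ll are deleted in the next file. The pack
; instructions themselves saturate each source element to the narrower
; result type before concatenating the two sources; for the single-lane
; 128-bit signed dword-to-word case that behaviour can be written in
; generic IR roughly as follows (illustrative sketch only, hypothetical
; function name):
define <8 x i16> @packssdw_sketch(<4 x i32> %a, <4 x i32> %b) {
  ; concatenate %a then %b, as PACKSSDW does within a 128-bit lane
  %cat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; clamp to [-32768, 32767] with two compare+select pairs
  %lt = icmp slt <8 x i32> %cat, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %lt, <8 x i32> %cat, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %gt = icmp sgt <8 x i32> %min, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %clamp = select <8 x i1> %gt, <8 x i32> %min, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  ; truncate the clamped dwords to words
  %res = trunc <8 x i32> %clamp to <8 x i16>
  ret <8 x i16> %res
}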
Index: llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -1,763 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s -define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_mask_packs_epi32_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %1 -} - -define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { -; CHECK-LABEL: test_mask_packs_epi32_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <4 x i32>, <4 x i32>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %1 -} - -define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <4 x i32>, <4 x i32>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <4 x i32>, <4 x i32>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { -; CHECK-LABEL: test_mask_packs_epi32_rmb_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 - %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer - %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %1 -} - -define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rmbk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 - %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer - %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: 
kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 - %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer - %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - ret <8 x i16> %3 -} - -declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) - -define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: test_mask_packs_epi32_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) - ret <16 x i16> %1 -} - -define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { -; CHECK-LABEL: test_mask_packs_epi32_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i32>, <8 x i32>* %ptr_b - %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) - ret <16 x i16> %1 -} - -define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i32>, <8 x i32>* %ptr_b - %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) { -; 
CHECK-LABEL: test_mask_packs_epi32_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i32>, <8 x i32>* %ptr_b - %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { -; CHECK-LABEL: test_mask_packs_epi32_rmb_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 - %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer - %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) - ret <16 x i16> %1 -} - -define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rmbk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 - %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer - %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 - %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer - %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer - ret <16 x i16> %3 -} - -declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) - -define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_packs_epi16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) - ret <16 x i8> %1 -} - -define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_packs_epi16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru - ret <16 x i8> %3 -} - -define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) { -; CHECK-LABEL: test_mask_packs_epi16_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer - ret <16 x i8> %3 -} - -define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_packs_epi16_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) - ret <16 x i8> %1 -} - -define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_packs_epi16_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru - ret <16 x i8> %3 -} - -define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_packs_epi16_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer - ret <16 x i8> %3 -} - -declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) - -define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_packs_epi16_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) - ret <32 x i8> %1 -} - -define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_packs_epi16_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: 
retq ## encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru - ret <32 x i8> %3 -} - -define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) { -; CHECK-LABEL: test_mask_packs_epi16_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer - ret <32 x i8> %3 -} - -define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_packs_epi16_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) - ret <32 x i8> %1 -} - -define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_packs_epi16_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru - ret <32 x i8> %3 -} - -define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) { -; CHECK-LABEL: test_mask_packs_epi16_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer - ret <32 x i8> %3 -} - -declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) - - -define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_mask_packus_epi32_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %1 -} - -define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_packus_epi32_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <8 x i16> 
@llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { -; CHECK-LABEL: test_mask_packus_epi32_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { -; CHECK-LABEL: test_mask_packus_epi32_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <4 x i32>, <4 x i32>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %1 -} - -define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_packus_epi32_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <4 x i32>, <4 x i32>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_packus_epi32_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <4 x i32>, <4 x i32>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { -; CHECK-LABEL: test_mask_packus_epi32_rmb_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %q = load i32, i32* %ptr_b - %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 - %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer - %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %1 -} - -define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_packus_epi32_rmbk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: 
[0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
-  ret <8 x i16> %3
-}
-
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
-
-define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: test_mask_packus_epi32_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi32_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi32_rmb_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmbk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
-  ret <16 x i16> %3
-}
-
-declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
-
-define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_mask_packus_epi16_rr_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
-  ret <16 x i8> %3
-}
-
-define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rrkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
-  ret <16 x i8> %3
-}
-
-define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi16_rm_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rmk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
-  ret <16 x i8> %3
-}
-
-define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rmkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
-  ret <16 x i8> %3
-}
-
-declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
-
-define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_mask_packus_epi16_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
-  ret <32 x i8> %3
-}
-
-define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
-  ret <32 x i8> %3
-}
-
-define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi16_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
-  ret <32 x i8> %3
-}
-
-define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
-  ret <32 x i8> %3
-}
-
-declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
-
 define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test_mask_adds_epi16_rr_128:
 ; CHECK:       ## %bb.0:
Index: llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
===================================================================
--- llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2002,11 +2002,15 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+  %1 = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = icmp slt <16 x i16> %1, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %4 = icmp sgt <16 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %res = trunc <16 x i16> %5 to <16 x i8>
   %bc = bitcast <16 x i8> %res to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_packs_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_packs_epi32:
@@ -2020,11 +2024,15 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
-  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %arg0, <4 x i32> %arg1)
+  %1 = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = icmp slt <8 x i32> %1, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %4 = icmp sgt <8 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %res = trunc <8 x i32> %5 to <8 x i16>
   %bc = bitcast <8 x i16> %res to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_packus_epi16(<2 x i64> %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_packus_epi16:
@@ -2038,11 +2046,15 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+  %1 = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = icmp slt <16 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %4 = icmp sgt <16 x i16> %3, zeroinitializer
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
+  %res = trunc <16 x i16> %5 to <16 x i8>
   %bc = bitcast <16 x i8> %res to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
 
 define void @test_mm_pause() nounwind {
 ; X32-LABEL: test_mm_pause:
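For reference, the expanded replacement pattern is signed saturation followed by truncation. A sketch of the new test_mm_packs_epi16 body in isolation, assuming the i8 signed-saturation bounds [-128, 127]; the function name @packs_epi16_pattern is illustrative only:

define <16 x i8> @packs_epi16_pattern(<8 x i16> %arg0, <8 x i16> %arg1) {
  ; concatenate the two 8 x i16 sources into one 16 x i16 vector (a single 128-bit lane)
  %1 = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; clamp to the i8 signed maximum (127)
  %2 = icmp slt <16 x i16> %1, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  ; clamp to the i8 signed minimum (-128)
  %4 = icmp sgt <16 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  ; every element is now in [-128, 127], so the truncation is lossless
  %res = trunc <16 x i16> %5 to <16 x i8>
  ret <16 x i8> %res
}

X86ISelLowering is then expected to recognize this clamp-and-truncate idiom and select PACKSS again, which is what the unchanged CHECK lines in the hunks above verify.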
Index: llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
===================================================================
--- llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -226,6 +226,100 @@
 }
 declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_x86_sse2_packssdw_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packssdw %xmm1, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_packssdw_128_unary(<4 x i32> %a) {
+; CHECK-LABEL: test_x86_sse2_packssdw_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packssdw %xmm0, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %a) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+
+
+define <8 x i16> @test_x86_sse2_packssdw_128_fold() {
+; CHECK-LABEL: test_x86_sse2_packssdw_128_fold:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
+  ret <8 x i16> %res
+}
+
+
+define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_x86_sse2_packsswb_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packsswb %xmm1, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_packsswb_128_unary(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_packsswb_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packsswb %xmm0, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %a) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
+define <16 x i8> @test_x86_sse2_packsswb_128_fold() {
+; CHECK-LABEL: test_x86_sse2_packsswb_128_fold:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
+  ret <16 x i8> %res
+}
+
+
+define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_x86_sse2_packuswb_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packuswb %xmm1, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_packuswb_128_unary(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_packuswb_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packuswb %xmm0, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %a) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
+define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
+; CHECK-LABEL: test_x86_sse2_packuswb_128_fold:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
+  ret <16 x i8> %res
+}
+
+
 define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: mm_avg_epu8:
 ; CHECK:       ## %bb.0:
Index: llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
===================================================================
--- llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -741,147 +741,6 @@
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
 
-
-define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
-; SSE-LABEL: test_x86_sse2_packssdw_128:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    packssdw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x6b,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packssdw_128:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packssdw_128:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_packssdw_128_fold() {
-; SSE-LABEL: test_x86_sse2_packssdw_128_fold:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
-; SSE-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; SSE-NEXT:    ## fixup A - offset: 3, value: LCPI35_0, kind: FK_Data_4
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packssdw_128_fold:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI35_0, kind: FK_Data_4
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packssdw_128_fold:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovaps LCPI35_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768]
-; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI35_0, kind: FK_Data_4
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
-  ret <8 x i16> %res
-}
-
-
-define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_packsswb_128:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    packsswb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x63,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packsswb_128:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packsswb_128:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_packsswb_128_fold() {
-; SSE-LABEL: test_x86_sse2_packsswb_128_fold:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; SSE-NEXT:    ## fixup A - offset: 3, value: LCPI37_0, kind: FK_Data_4
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packsswb_128_fold:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI37_0, kind: FK_Data_4
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packsswb_128_fold:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovaps LCPI37_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI37_0, kind: FK_Data_4
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
-  ret <16 x i8> %res
-}
-
-
-define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_packuswb_128:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    packuswb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x67,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packuswb_128:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x67,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packuswb_128:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
-; SSE-LABEL: test_x86_sse2_packuswb_128_fold:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; SSE-NEXT:    ## fixup A - offset: 3, value: LCPI39_0, kind: FK_Data_4
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packuswb_128_fold:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI39_0, kind: FK_Data_4
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packuswb_128_fold:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovaps LCPI39_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI39_0, kind: FK_Data_4
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
-  ret <16 x i8> %res
-}
-
-
 define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE-LABEL: test_x86_sse2_padds_b:
 ; SSE:       ## %bb.0:
Index: llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
===================================================================
--- llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -847,11 +847,15 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
-  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %arg0, <4 x i32> %arg1)
+  %1 = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = icmp slt <8 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %4 = icmp sgt <8 x i32> %3, zeroinitializer
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
+  %res = trunc <8 x i32> %5 to <8 x i16>
   %bc = bitcast <8 x i16> %res to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
 ; X32-LABEL: test_mm_round_pd:
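The unsigned packs clamp to [0, 65535] (or [0, 255] for word-to-byte) before truncating, so negative inputs saturate to zero. A sketch of the expanded _mm_packus_epi32 pattern from the hunk above with the splat constants written out; the name @packus_epi32_pattern is illustrative only:

define <8 x i16> @packus_epi32_pattern(<4 x i32> %arg0, <4 x i32> %arg1) {
  ; concatenate the two 4 x i32 sources
  %1 = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; clamp to the u16 maximum (65535)
  %2 = icmp slt <8 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  ; clamp negative inputs to the u16 minimum (0)
  %4 = icmp sgt <8 x i32> %3, zeroinitializer
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  ; elements are now in [0, 65535], so the truncation is lossless
  %res = trunc <8 x i32> %5 to <8 x i16>
  ret <8 x i16> %res
}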
Index: llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
===================================================================
--- llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -83,6 +83,37 @@
 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
 
+
+define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_x86_sse41_packusdw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packusdw %xmm1, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_packusdw_unary(<4 x i32> %a) {
+; CHECK-LABEL: test_x86_sse41_packusdw_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packusdw %xmm0, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %a) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+
+
+define <8 x i16> @test_x86_sse41_packusdw_fold() {
+; CHECK-LABEL: test_x86_sse41_packusdw_fold:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
+  ret <8 x i16> %res
+}
+
+
 define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: test_x86_sse41_pblendw:
 ; CHECK:       ## %bb.0:
Index: llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
===================================================================
--- llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -114,53 +114,6 @@
 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
-
-define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
-; SSE41-LABEL: test_x86_sse41_packusdw:
-; SSE41:       ## %bb.0:
-; SSE41-NEXT:    packusdw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x2b,0xc1]
-; SSE41-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse41_packusdw:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse41_packusdw:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_packusdw_fold() {
-; SSE41-LABEL: test_x86_sse41_packusdw_fold:
-; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
-; SSE41-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; SSE41-NEXT:    ## fixup A - offset: 3, value: LCPI7_0, kind: FK_Data_4
-; SSE41-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse41_packusdw_fold:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse41_packusdw_fold:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovaps LCPI7_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0]
-; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
-  ret <8 x i16> %res
-}
-
-
 define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
 ; SSE41-LABEL: test_x86_sse41_pblendvb:
 ; SSE41:       ## %bb.0:
Index: llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
===================================================================
--- llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
+++ llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -4,42 +4,8 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
-declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind readnone
 declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
 
-define <8 x i16> @Test_packssdw_128(<4 x i32> %a, <4 x i32> %b) sanitize_memory {
-entry:
-  %c = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) nounwind
-  ret <8 x i16> %c
-}
-
-; CHECK-LABEL: @Test_packssdw_128(
-; CHECK-DAG: icmp ne <4 x i32> {{.*}}, zeroinitializer
-; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i32>
-; CHECK-DAG: icmp ne <4 x i32> {{.*}}, zeroinitializer
-; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i32>
-; CHECK-DAG: call <8 x i16> @llvm.x86.sse2.packssdw.128(
-; CHECK-DAG: call <8 x i16> @llvm.x86.sse2.packssdw.128(
-; CHECK: ret <8 x i16>
-
-
-define <32 x i8> @Test_avx_packuswb(<16 x i16> %a, <16 x i16> %b) sanitize_memory {
-entry:
-  %c = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind
-  ret <32 x i8> %c
-}
-
-; CHECK-LABEL: @Test_avx_packuswb(
-; CHECK-DAG: icmp ne <16 x i16> {{.*}}, zeroinitializer
-; CHECK-DAG: sext <16 x i1> {{.*}} to <16 x i16>
-; CHECK-DAG: icmp ne <16 x i16> {{.*}}, zeroinitializer
-; CHECK-DAG: sext <16 x i1> {{.*}} to <16 x i16>
-; CHECK-DAG: call <32 x i8> @llvm.x86.avx2.packsswb(
-; CHECK-DAG: call <32 x i8> @llvm.x86.avx2.packuswb(
-; CHECK: ret <32 x i8>
-
-
 define x86_mmx @Test_mmx_packuswb(x86_mmx %a, x86_mmx %b) sanitize_memory {
 entry:
   %c = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind
Index: llvm/test/Transforms/InstCombine/X86/x86-pack.ll
===================================================================
--- llvm/test/Transforms/InstCombine/X86/x86-pack.ll
+++ /dev/null
@@ -1,366 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-;
-; UNDEF Elts
-;
-
-define <8 x i16> @undef_packssdw_128() {
-; CHECK-LABEL: @undef_packssdw_128(
-; CHECK-NEXT:    ret <8 x i16> undef
-;
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> undef, <4 x i32> undef)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @undef_packusdw_128() {
-; CHECK-LABEL: @undef_packusdw_128(
-; CHECK-NEXT:    ret <8 x i16> undef
-;
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> undef)
-  ret <8 x i16> %1
-}
-
-define <16 x i8> @undef_packsswb_128() {
-; CHECK-LABEL: @undef_packsswb_128(
-; CHECK-NEXT:    ret <16 x i8> undef
-;
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> undef, <8 x i16> undef)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @undef_packuswb_128() {
-; CHECK-LABEL: @undef_packuswb_128(
-; CHECK-NEXT:    ret <16 x i8> undef
-;
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> undef, <8 x i16> undef)
-  ret <16 x i8> %1
-}
-
-define <16 x i16> @undef_packssdw_256() {
-; CHECK-LABEL: @undef_packssdw_256(
-; CHECK-NEXT:    ret <16 x i16> undef
-;
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> undef)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @undef_packusdw_256() {
-; CHECK-LABEL: @undef_packusdw_256(
-; CHECK-NEXT:    ret <16 x i16> undef
-;
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> undef)
-  ret <16 x i16> %1
-}
-
-define <32 x i8> @undef_packsswb_256() {
-; CHECK-LABEL: @undef_packsswb_256(
-; CHECK-NEXT:    ret <32 x i8> undef
-;
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> undef)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @undef_packuswb_256() {
-; CHECK-LABEL: @undef_packuswb_256(
-; CHECK-NEXT:    ret <32 x i8> undef
-;
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> undef, <16 x i16> undef)
-  ret <32 x i8> %1
-}
-
-define <32 x i16> @undef_packssdw_512() {
-; CHECK-LABEL: @undef_packssdw_512(
-; CHECK-NEXT:    ret <32 x i16> undef
-;
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> undef, <16 x i32> undef)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @undef_packusdw_512() {
-; CHECK-LABEL: @undef_packusdw_512(
-; CHECK-NEXT:    ret <32 x i16> undef
-;
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> undef)
-  ret <32 x i16> %1
-}
-
-define <64 x i8> @undef_packsswb_512() {
-; CHECK-LABEL: @undef_packsswb_512(
-; CHECK-NEXT:    ret <64 x i8> undef
-;
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> undef)
-  ret <64 x i8> %1
-}
-
-define <64 x i8> @undef_packuswb_512() {
-; CHECK-LABEL: @undef_packuswb_512(
-; CHECK-NEXT:    ret <64 x i8> undef
-;
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> undef, <32 x i16> undef)
-  ret <64 x i8> %1
-}
-
-;
-; Constant Folding
-;
-
-define <8 x i16> @fold_packssdw_128() {
-; CHECK-LABEL: @fold_packssdw_128(
-; CHECK-NEXT:    ret <8 x i16>
-;
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> , <4 x i32> zeroinitializer)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @fold_packusdw_128() {
-; CHECK-LABEL: @fold_packusdw_128(
-; CHECK-NEXT:    ret <8 x i16>
-;
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> )
-  ret <8 x i16> %1
-}
-
-define <16 x i8> @fold_packsswb_128() {
-; CHECK-LABEL: @fold_packsswb_128(
-; CHECK-NEXT:    ret <16 x i8>
-;
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> zeroinitializer, <8 x i16> undef)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @fold_packuswb_128() {
-; CHECK-LABEL: @fold_packuswb_128(
-; CHECK-NEXT:    ret <16 x i8>
-;
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> , <8 x i16> )
-  ret <16 x i8> %1
-}
-
-define <16 x i16> @fold_packssdw_256() {
-; CHECK-LABEL: @fold_packssdw_256(
-; CHECK-NEXT:    ret <16 x i16>
-;
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> , <8 x i32> undef)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @fold_packusdw_256() {
-; CHECK-LABEL: @fold_packusdw_256(
-; CHECK-NEXT:    ret <16 x i16>
-;
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> , <8 x i32> )
-  ret <16 x i16> %1
-}
-
-define <32 x i8> @fold_packsswb_256() {
-; CHECK-LABEL: @fold_packsswb_256(
-; CHECK-NEXT:    ret <32 x i8>
-;
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> zeroinitializer)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @fold_packuswb_256() {
-; CHECK-LABEL: @fold_packuswb_256(
-; CHECK-NEXT:    ret <32 x i8>
-;
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> zeroinitializer, <16 x i16> )
-  ret <32 x i8> %1
-}
-
-define <32 x i16> @fold_packssdw_512() {
-; CHECK-LABEL: @fold_packssdw_512(
-; CHECK-NEXT:    ret <32 x i16>
-;
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> , <16 x i32> undef)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @fold_packusdw_512() {
-; CHECK-LABEL: @fold_packusdw_512(
-; CHECK-NEXT:    ret <32 x i16>
-;
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> , <16 x i32> )
-  ret <32 x i16> %1
-}
-
-define <64 x i8> @fold_packsswb_512() {
-; CHECK-LABEL: @fold_packsswb_512(
-; CHECK-NEXT:    ret <64 x i8>
-;
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> zeroinitializer)
-  ret <64 x i8> %1
-}
-
-define <64 x i8> @fold_packuswb_512() {
-; CHECK-LABEL: @fold_packuswb_512(
-; CHECK-NEXT:    ret <64 x i8>
-;
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> zeroinitializer, <32 x i16> )
-  ret <64 x i8> %1
-}
-
-;
-; Demanded Elts
-;
-
-define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: @elts_packssdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[A0:%.*]], <4 x i32> undef)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32>
-; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
-;
-  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
-  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
-  %3 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %2)
-  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32>
-  ret <8 x i16> %4
-}
-
-define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: @elts_packusdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]])
-; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
-;
-  %1 = insertelement <4 x i32> %a0, i32 0, i32 0
-  %2 = insertelement <4 x i32> %a1, i32 0, i32 3
-  %3 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %1, <4 x i32> %2)
-  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32>
-  ret <8 x i16> %4
-}
-
-define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @elts_packsswb_128(
-; CHECK-NEXT:    ret <16 x i8> zeroinitializer
-;
-  %1 = insertelement <8 x i16> %a0, i16 0, i32 0
-  %2 = insertelement <8 x i16> %a1, i16 0, i32 0
-  %3 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
-  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32>
-  ret <16 x i8> %4
-}
-
-define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @elts_packuswb_128(
-; CHECK-NEXT:    ret <16 x i8> undef
-;
-  %1 = insertelement <8 x i16> undef, i16 0, i32 0
-  %2 = insertelement <8 x i16> undef, i16 0, i32 0
-  %3 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
-  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32>
-  ret <16 x i8> %4
-}
-
-define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: @elts_packssdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[A0:%.*]], <8 x i32> undef)
-; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
-;
-  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
-  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-  %3 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %2)
-  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32>
-  ret <16 x i16> %4
-}
-
-define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: @elts_packusdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32>
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
-;
-  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
-  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-  %3 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %2)
-  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32>
-  ret <16 x i16> %4
-}
-
-define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK-LABEL: @elts_packsswb_256(
-; CHECK-NEXT:    ret <32 x i8> zeroinitializer
-;
-  %1 = insertelement <16 x i16> %a0, i16 0, i32 0
-  %2 = insertelement <16 x i16> %a1, i16 0, i32 8
-  %3 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
-  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32>
-  ret <32 x i8> %4
-}
-
-define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK-LABEL: @elts_packuswb_256(
-; CHECK-NEXT:    ret <32 x i8> undef
-;
-  %1 = insertelement <16 x i16> undef, i16 0, i32 1
-  %2 = insertelement <16 x i16> undef, i16 0, i32 0
-  %3 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
-  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> zeroinitializer
-  ret <32 x i8> %4
-}
-
-define <32 x i16> @elts_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) {
-; CHECK-LABEL: @elts_packssdw_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A0:%.*]], <16 x i32> undef)
-; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
-;
-  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32>
-  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32>
-  %3 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %1, <16 x i32> %2)
-  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32>
-  ret <32 x i16> %4
-}
-
-define <32 x i16> @elts_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) {
-; CHECK-LABEL: @elts_packusdw_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> undef, <32 x i32>
-; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
-;
-  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32>
-  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32>
-  %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %1, <16 x i32> %2)
-  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32>
-  ret <32 x i16> %4
-}
-
-define <64 x i8> @elts_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) {
-; CHECK-LABEL: @elts_packsswb_512(
-; CHECK-NEXT:    ret <64 x i8> zeroinitializer
-;
-  %1 = insertelement <32 x i16> %a0, i16 0, i32 0
-  %2 = insertelement <32 x i16> %a1, i16 0, i32 8
-  %3 = insertelement <32 x i16> %1, i16 0, i32 16
-  %4 = insertelement <32 x i16> %2, i16 0, i32 24
-  %5 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %3, <32 x i16> %4)
-  %6 = shufflevector <64 x i8> %5, <64 x i8> undef, <64 x i32>
-  ret <64 x i8> %6
-}
-
-define <64 x i8> @elts_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) {
-; CHECK-LABEL: @elts_packuswb_512(
-; CHECK-NEXT:    ret <64 x i8> undef
-;
-  %1 = insertelement <32 x i16> undef, i16 0, i32 1
-  %2 = insertelement <32 x i16> undef, i16 0, i32 0
-  %3 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %1, <32 x i16> %2)
-  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> zeroinitializer
-  ret <64 x i8> %4
-}
-
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
-declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
-declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
-declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
-declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
-
-declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
-declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
-declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
-declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone
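One last point worth illustrating: for the 256-bit and 512-bit intrinsics the concatenation shuffle interleaves the two sources per 128-bit lane rather than back to back, matching the lane-wise behavior of VPACKSSDW/VPACKSSWB on ymm/zmm registers. A sketch of the AVX2 packssdw replacement pattern, assuming the i16 signed bounds [-32768, 32767]; the name @packssdw_256_pattern is illustrative only:

define <16 x i16> @packssdw_256_pattern(<8 x i32> %a, <8 x i32> %b) {
  ; per 128-bit lane: four elements of %a, then the corresponding four elements of %b
  %1 = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ; clamp to the i16 signed maximum (32767)
  %2 = icmp slt <16 x i32> %1, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  ; clamp to the i16 signed minimum (-32768)
  %4 = icmp sgt <16 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  ; elements are now in [-32768, 32767], so the truncation is lossless
  %res = trunc <16 x i32> %5 to <16 x i16>
  ret <16 x i16> %res
}

The 512-bit patterns are the same shape with four lanes instead of two, which is why the InstCombine-specific pack handling and its tests above can be dropped: generic shuffle/clamp/trunc folds now cover the undef, constant-folding, and demanded-elements cases.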