Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -378,30 +378,6 @@ // Integer arithmetic ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb128">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, - llvm_v16i8_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, - llvm_v8i16_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb128">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, - llvm_v16i8_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, - llvm_v8i16_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb128">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, - llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_sse2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, - llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_sse2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb128">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, - llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_sse2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, - llvm_v8i16_ty], [IntrNoMem]>; def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; @@ -1627,30 +1603,6 @@ // Integer arithmetic ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
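// Note (editorial, not part of the patch): the AVX2 saturating add/sub
// intrinsics below are removed just like their SSE2 counterparts above.
// Existing bitcode keeps working because AutoUpgrade.cpp (later in this
// patch) rewrites such calls into generic widen/clamp/truncate IR.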
- def int_x86_avx2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, - llvm_v32i8_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, - llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, - llvm_v32i8_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, - llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, - llvm_v32i8_ty], [IntrNoMem]>; - def int_x86_avx2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, - llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, - llvm_v32i8_ty], [IntrNoMem]>; - def int_x86_avx2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, - llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; @@ -4695,78 +4647,6 @@ } // Integer arithmetic ops let TargetPrefix = "x86" in { - def int_x86_avx512_mask_padds_b_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_b_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, - llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512_mask">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_w_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, - llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_w_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, - llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512_mask">, - Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_b_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_b_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, - llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_b_512 : GCCBuiltin<"__builtin_ia32_paddusb512_mask">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_w_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, - llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_w_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, - llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_w_512 : 
GCCBuiltin<"__builtin_ia32_paddusw512_mask">, - Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_b_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_b_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, - llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512_mask">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_w_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, - llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_w_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, - llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512_mask">, - Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_b_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_b_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, - llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_b_512 : GCCBuiltin<"__builtin_ia32_psubusb512_mask">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_w_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, - llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_w_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, - llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">, - Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, Commutative]>; Index: lib/IR/AutoUpgrade.cpp =================================================================== --- lib/IR/AutoUpgrade.cpp +++ lib/IR/AutoUpgrade.cpp @@ -84,7 +84,19 @@ // like to use this information to remove upgrade code for some older // intrinsics. It is currently undecided how we will determine that future // point. 
-  if (Name=="ssse3.pabs.b.128" || // Added in 6.0
+  if (Name.startswith("sse2.padds") || // Added in 7.0
+      Name.startswith("sse2.paddus") || // Added in 7.0
+      Name.startswith("sse2.psubs") || // Added in 7.0
+      Name.startswith("sse2.psubus") || // Added in 7.0
+      Name.startswith("avx2.padds") || // Added in 7.0
+      Name.startswith("avx2.paddus") || // Added in 7.0
+      Name.startswith("avx2.psubs") || // Added in 7.0
+      Name.startswith("avx2.psubus") || // Added in 7.0
+      Name.startswith("avx512.mask.padds") || // Added in 7.0
+      Name.startswith("avx512.mask.paddus") || // Added in 7.0
+      Name.startswith("avx512.mask.psubs") || // Added in 7.0
+      Name.startswith("avx512.mask.psubus") || // Added in 7.0
+      Name=="ssse3.pabs.b.128" || // Added in 6.0
       Name=="ssse3.pabs.w.128" || // Added in 6.0
       Name=="ssse3.pabs.d.128" || // Added in 6.0
       Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
@@ -845,6 +857,77 @@
   return EmitX86Select(Builder, Mask, Align, Passthru);
 }

+static Value *UpgradeX86AddSubSatIntrinsics(IRBuilder<> &Builder, CallInst &CI,
+                                            bool IsSigned, bool IsAddition) {
+  // Get the operands.
+  Value *Op0 = CI.getArgOperand(0);
+  Value *Op1 = CI.getArgOperand(1);
+
+  // Result type and element count.
+  Type *ResultType = CI.getType();
+  unsigned NumElts = ResultType->getVectorNumElements();
+
+  Value *Res;
+  if (!IsAddition && !IsSigned) {
+    Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, Op0, Op1);
+    Value *Select = Builder.CreateSelect(ICmp, Op0, Op1);
+    Res = Builder.CreateSub(Select, Op1);
+  } else {
+    Type *EltType = ResultType->getVectorElementType();
+    Type *ExtEltType = EltType == Builder.getInt8Ty() ? Builder.getInt16Ty()
+                                                      : Builder.getInt32Ty();
+    Type *ExtVT = VectorType::get(ExtEltType, NumElts);
+    Op0 = IsSigned ? Builder.CreateSExt(Op0, ExtVT)
+                   : Builder.CreateZExt(Op0, ExtVT);
+    Op1 = IsSigned ? Builder.CreateSExt(Op1, ExtVT)
+                   : Builder.CreateZExt(Op1, ExtVT);
+
+    // Perform the addition/subtraction.
+    Res = IsAddition ? Builder.CreateAdd(Op0, Op1)
+                     : Builder.CreateSub(Op0, Op1);
+
+    // Create a vector of maximum values of the unextended type
+    // (if overflow occurs, it will be saturated to that value).
+    unsigned EltSizeInBits = EltType->getPrimitiveSizeInBits();
+    APInt MaxInt = IsSigned ? APInt::getSignedMaxValue(EltSizeInBits)
+                            : APInt::getMaxValue(EltSizeInBits);
+    Value *MaxVec = ConstantInt::get(ResultType, MaxInt);
+    // Extend so that it can be compared to the result of the add/sub.
+    MaxVec = IsSigned ? Builder.CreateSExt(MaxVec, ExtVT)
+                      : Builder.CreateZExt(MaxVec, ExtVT);
+
+    // Saturate overflow.
+    ICmpInst::Predicate Pred = IsSigned ? ICmpInst::ICMP_SLE
+                                        : ICmpInst::ICMP_ULE;
+    Value *Cmp = Builder.CreateICmp(Pred, Res,
+                                    MaxVec); // 1 if no overflow.
+    Res = Builder.CreateSelect(Cmp, Res,
+                               MaxVec); // If overflowed, copy from max vec.
+
+    // Saturate underflow.
+    if (IsSigned) {
+      APInt MinInt = APInt::getSignedMinValue(EltSizeInBits);
+      Value *MinVec = ConstantInt::get(ResultType, MinInt);
+      // Extend so that it can be compared to the result of the add/sub.
+      MinVec = Builder.CreateSExt(MinVec, ExtVT);
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_SGT, Res,
+                                      MinVec); // 1 if no underflow.
+      Res = Builder.CreateSelect(Cmp, Res,
+                                 MinVec); // If underflowed, copy from min vec.
+    }
+
+    // Truncate to the original type.
+    Res = Builder.CreateTrunc(Res, ResultType);
+  }
+
+  if (CI.getNumArgOperands() == 4) { // For masked intrinsics.
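+    // Masked AVX512 variants carry two extra arguments: a passthrough vector
+    // (operand 2) and a scalar condition mask (operand 3). The select below
+    // keeps saturated lanes where the mask is set and passthrough lanes
+    // elsewhere.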
+    Value *VecSRC = CI.getArgOperand(2);
+    Value *Mask = CI.getArgOperand(3);
+    Res = EmitX86Select(Builder, Mask, Res, VecSRC);
+  }
+  return Res;
+}
+
 static Value *UpgradeMaskedStore(IRBuilder<> &Builder,
                                  Value *Ptr, Value *Data, Value *Mask,
                                  bool Aligned) {
@@ -1684,6 +1767,26 @@
                                       ShuffleMask);
     Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                         CI->getArgOperand(1));
+  } else if (IsX86 && (Name.startswith("sse2.padds") ||
+                       Name.startswith("avx2.padds") ||
+                       Name.startswith("avx512.mask.padds"))) {
+    Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
+                                        true, true); // Signed add.
+  } else if (IsX86 && (Name.startswith("sse2.paddus") ||
+                       Name.startswith("avx2.paddus") ||
+                       Name.startswith("avx512.mask.paddus"))) {
+    Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
+                                        false, true); // Unsigned add.
+  } else if (IsX86 && (Name.startswith("sse2.psubs") ||
+                       Name.startswith("avx2.psubs") ||
+                       Name.startswith("avx512.mask.psubs"))) {
+    Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
+                                        true, false); // Signed sub.
+  } else if (IsX86 && (Name.startswith("sse2.psubus") ||
+                       Name.startswith("avx2.psubus") ||
+                       Name.startswith("avx512.mask.psubus"))) {
+    Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
+                                        false, false); // Unsigned sub.
   } else if (IsX86 && (Name.startswith("avx2.pbroadcast") ||
                        Name.startswith("avx2.vbroadcast") ||
                        Name.startswith("avx512.pbroadcast") ||
@@ -1694,7 +1797,6 @@
     Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);
     Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()),
                                       Constant::getNullValue(MaskTy));
-    if (CI->getNumArgOperands() == 3)
       Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                           CI->getArgOperand(1));
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -35997,6 +35997,118 @@
   return SDValue();
 }

+/// This function detects the addition or subtraction with saturation pattern
+/// between two signed/unsigned i8/i16 vectors and replaces this operation
+/// with the efficient X86ISD::ADDS/X86ISD::ADDUS/X86ISD::SUBS/X86ISD::SUBUS
+/// node.
+static SDValue detectAddSubSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget,
+                                      const SDLoc &DL) {
+  if (!VT.isVector())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if ((ScalarVT != MVT::i8 && ScalarVT != MVT::i16) ||
+      InVT.getSizeInBits() % 128 != 0 || !isPowerOf2_32(NumElems))
+    return SDValue();
+
+  // InScalarVT is the intermediate type in the AddSubSat pattern
+  // and it should be wider than the original element type (i8/i16).
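+  // For example, for an upgraded PADDSB-style pattern VT is <16 x i8> while
+  // In is the <16 x i16> clamped value built from the extended operands.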
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  // Detect the following pattern:
+  //   %2 = zext <16 x i8> %0 to <16 x i16>
+  //   %3 = zext <16 x i8> %1 to <16 x i16>
+  //   %4 = add nuw nsw <16 x i16> %3, %2
+  //   %5 = icmp ult <16 x i16> %4, <16 x i16> (vector of max InScalarVT values)
+  //   %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> (vector of max InScalarVT values)
+  //   %7 = trunc <16 x i16> %6 to <16 x i8>

+  // Detect a saturation pattern.
+  bool Signed = true;
+  SDValue Sat = detectSSatPattern(In, VT, false);
+  if (!Sat) {
+    Sat = detectUSatPattern(In, VT);
+    Signed = false;
+  }
+  if (!Sat)
+    return SDValue();
+  if (Sat.getOpcode() != ISD::ADD && Sat.getOpcode() != ISD::SUB)
+    return SDValue();
+
+  unsigned Opcode = Sat.getOpcode() == ISD::ADD ? Signed ? X86ISD::ADDS
+                                                         : X86ISD::ADDUS
+                                                : Signed ? X86ISD::SUBS
+                                                         : X86ISD::SUBUS;
+
+  // Get the addition/subtraction operands.
+  SDValue LHS = Sat.getOperand(0);
+  SDValue RHS = Sat.getOperand(1);
+
+  // Check if Op is a result of type promotion.
+  auto IsExtended = [=, &DAG](SDValue Op) {
+    unsigned Opcode = Op.getOpcode();
+    unsigned EltSize = ScalarVT.getSizeInBits();
+    unsigned ExtEltSize = InScalarVT.getSizeInBits();
+    unsigned ExtPartSize = ExtEltSize - EltSize;
+
+    // Extension of a non-constant operand.
+    if (Opcode == ISD::ZERO_EXTEND || Opcode == ISD::SIGN_EXTEND) {
+      if (Signed)
+        return DAG.ComputeNumSignBits(Op) > ExtPartSize;
+      else {
+        return DAG.MaskedValueIsZero(Op, APInt::getHighBitsSet(ExtEltSize, ExtPartSize));
+      }
+    // Build vector of constant nodes. Each of them needs to be a correct
+    // extension from a constant of ScalarVT type.
+    } else if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+      unsigned NumOperands = Op.getNumOperands();
+      for (unsigned i = 0; i < NumOperands; ++i) {
+        APInt Elt = cast<ConstantSDNode>(Op.getOperand(i))->getAPIntValue();
+        Elt = Elt.getHiBits(Signed ? ExtPartSize + 1 : ExtPartSize);
+        if ((Signed && (!Elt.isAllOnesValue() && !Elt.isNullValue())) ||
+            (!Signed && !Elt.isNullValue()))
+          return false;
+      }
+      return true;
+    }
+    return false;
+  };
+
+  // Either both operands are extended, or one of them is extended
+  // and the other is a vector of constants.
+  if (!IsExtended(LHS) || !IsExtended(RHS))
+    return SDValue();
+
+  // At this point it's guaranteed that at most one of them is
+  // a vector of constants (otherwise the sat pattern wouldn't have been
+  // detected).
+  if (RHS.getOpcode() == ISD::BUILD_VECTOR) {
+    LHS = LHS.getOperand(0);
+    RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
+  } else if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
+    RHS = RHS.getOperand(0);
+    LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
+  } else {
+    LHS = LHS.getOperand(0);
+    RHS = RHS.getOperand(0);
+  }
+
+  // The pattern is detected, emit the ADDS/ADDUS/SUBS/SUBUS instruction.
+  auto AddSubSatBuilder = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
+                                   ArrayRef<SDValue> Ops) {
+    EVT VT = Ops[0].getValueType();
+    return DAG.getNode(Opcode, DL, VT, Ops);
+  };
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { LHS, RHS },
+                          AddSubSatBuilder);
+}
+
 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
@@ -36011,6 +36123,10 @@
   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
     return Avg;

+  // Try to detect addition or subtraction with saturation.
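+  // Illustrative sketch (editorial, not from the patch): when detection
+  // succeeds, a chain like
+  //   trunc(smin(smax(add(sext %a, sext %b), -128), 127))
+  // over <16 x i8> collapses into a single X86ISD::ADDS node here, which
+  // later selects to one paddsb instruction.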
+ if (SDValue AddSubSat = detectAddSubSatPattern(Src, VT, DAG, Subtarget, DL)) + return AddSubSat; + // Try to combine truncation with signed/unsigned saturation. if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget)) return Val; Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -402,10 +402,6 @@ X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -444,10 +440,6 @@ X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0), X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0), X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0), - X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0), - X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), @@ -803,18 +795,6 @@ X86ISD::FMULS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMULS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK, @@ -981,18 +961,6 @@ X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), - X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), - 
X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
                      X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256,
@@ -1602,10 +1570,6 @@
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
-  X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1627,10 +1591,6 @@
   X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
-  X86_INTRINSIC_DATA(sse2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
Index: test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -98,11 +98,17 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %1 = sext <32 x i8> %arg0 to <32 x i16>
+  %2 = sext <32 x i8> %arg1 to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %bc = bitcast <32 x i8> %8 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone

 define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epi16:
@@ -111,11 +117,17 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %1 = sext <16 x i16> %arg0 to <16 x i32>
+  %2 = sext <16 x i16> %arg1 to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %bc = bitcast <16 x i16> %8 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone

 define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epu8:
@@ -124,11 +136,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %1 = zext <32 x i8> %arg0 to <32 x i16>
+  %2 = zext <32 x i8> %arg1 to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <32 x i16> %5 to <32 x i8>
+  %bc = bitcast <32 x i8> %6 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone

 define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epu16:
@@ -137,11 +153,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %1 = zext <16 x i16> %arg0 to <16 x i32>
+  %2 = zext <16 x i16> %arg1 to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp ult <16 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %6 = trunc <16 x i32> %5 to <16 x i16>
+  %bc = bitcast <16 x i16> %6 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone

 define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_alignr_epi8:
@@ -2529,11 +2549,17 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %1 = sext <32 x i8> %arg0 to <32 x i16>
+  %2 = sext <32 x i8> %arg1 to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %bc = bitcast <32 x i8> %8 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone

 define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epi16:
@@ -2542,37 +2568,47 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %1 = sext <16 x i16> %arg0 to <16 x i32>
+  %2 = sext <16 x i16> %arg1 to <16 x i32>
+  %3 = sub nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %bc = bitcast <16 x i16> %8 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone

 define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epu8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %cmp = icmp ugt <32 x i8> %arg0, %arg1
+  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+  %sub = sub <32 x i8> %sel, %arg1
+  %bc = bitcast <32 x i8> %sub to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone

 define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epu16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %cmp = icmp ugt <16 x i16> %arg0, %arg1
+  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+  %sub = sub <16 x i16> %sel, %arg1
+  %bc = bitcast <16 x i16> %sub to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone

 define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; CHECK-LABEL: test_mm256_unpackhi_epi8:
Index: test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -848,6 +848,133 @@
 declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

+define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
+; AVX2-LABEL: test_x86_avx2_padds_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    ret{{[l|q]}}
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
+; AVX2-LABEL: test_x86_avx2_padds_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    ret{{[l|q]}}
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x i16>
@llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) { +; AVX2-LABEL: test_x86_avx2_paddus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_paddus_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) { +; AVX2-LABEL: test_x86_avx2_paddus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_paddus_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) { +; AVX2-LABEL: test_x86_avx2_psubs_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_psubs_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) { +; AVX2-LABEL: test_x86_avx2_psubs_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_psubs_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) { +; AVX2-LABEL: test_x86_avx2_psubus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_psubus_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) { +; AVX2-LABEL: test_x86_avx2_psubus_w: +; AVX2: ## %bb.0: +; 
AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_psubus_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone + define <4 x i64> @test_x86_avx2_pmulu_dq(<8 x i32> %a0, <8 x i32> %a1) { ; X86-LABEL: test_x86_avx2_pmulu_dq: ; X86: ## %bb.0: Index: test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86.ll +++ test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -181,110 +181,6 @@ } -define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_padds_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_padds_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_padds_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_padds_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_padds_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_padds_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_padds_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_padds_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone - - -define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_paddus_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfd,0xdc,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_paddus_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_paddus_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_paddus_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone - - define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) { ; X86-AVX-LABEL: test_x86_avx2_pmadd_wd: ; X86-AVX: ## %bb.0: @@ -927,109 +823,6 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone -define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psubs_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psubs_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psubs_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psubs_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psubs_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w: -; X86-AVX512VL: ## %bb.0: -; 
X86-AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psubs_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone - - -define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psubus_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psubus_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psubus_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psubus_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone - define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) { ; X86-LABEL: test_x86_avx2_phadd_d: ; X86: ## %bb.0: @@ -1330,29 +1123,29 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4 -; X86-AVX-NEXT: retl ## encoding: [0xc3] +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI46_0, 
kind: FK_Data_4 +; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovaps LCPI54_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: vmovaps LCPI46_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI46_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: retq ## encoding: [0xc3] +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI46_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI46_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> ) ret <16 x i16> %res } @@ -2071,37 +1864,37 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X86-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4 -; X86-AVX-NEXT: vpsravd LCPI86_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4 -; X86-AVX-NEXT: retl ## encoding: [0xc3] +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI78_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI78_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI78_1, kind: FK_Data_4 +; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI86_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] +; X86-AVX512VL-NEXT: vmovdqa LCPI78_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI86_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4 -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI78_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI78_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI78_1, kind: FK_Data_4 +; 
X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X64-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI78_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: retq ## encoding: [0xc3] +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI78_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI78_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI78_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> ) ret <4 x i32> %res } @@ -2136,37 +1929,37 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 -; X86-AVX-NEXT: vpsravd LCPI88_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 -; X86-AVX-NEXT: retl ## encoding: [0xc3] +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI80_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI80_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI80_1, kind: FK_Data_4 +; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI88_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-AVX512VL-NEXT: vmovdqa LCPI80_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI88_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI80_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI80_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI80_1, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; 
X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI80_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: retq ## encoding: [0xc3] +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI80_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI80_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI80_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> ) ret <8 x i32> %res } Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -2694,6 +2694,422 @@ ret <32 x i16> %res2 } +define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: 
test_mask_adds_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: +; 
AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512: +; 
AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; 
AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + 
%b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -579,422 +579,6 @@ declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) -define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_adds_epi16_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: 
test_mask_adds_epi16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - -define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_subs_epi16_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: 
test_mask_subs_epi16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_subs_epi16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - -define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_adds_epu16_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: 
kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - -define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> 
%b) { -; AVX512BW-LABEL: test_mask_subs_epu16_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} 
-; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { Index: test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -3947,6 +3947,1046 @@ ret <16 x i16> %res2 } +define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; 
CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, 
<16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_128: 
+; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: 
[0x62,0xf1,0x7d,0xa9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x 
i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, 
<16 x i16>, <16 x i16>, i16) + +define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw 
%ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x 
i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> 
%passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + +define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: 
[0x62,0xf1,0x7d,0x09,0xe8,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epi8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> 
@llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + +define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epu8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; 
CHECK-LABEL: test_mask_adds_epu8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epu8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: 
[0x62,0xf1,0x7d,0xa9,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + +define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epu8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret 
<16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epu8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb (%rdi), %ymm0, 
%ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8) define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) { Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -758,1046 +758,6 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) -define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epi16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> 
@llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epi16_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) - -define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epi16_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, 
<16 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epi16_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) - -define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epi16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epi16_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xe9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) - -define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epi16_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epi16_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x 
i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) - -define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epu16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epu16_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 
%mask) { -; CHECK-LABEL: test_mask_adds_epu16_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) - -define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epu16_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epu16_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: 
vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) - -define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epu16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epu16_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; 
CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) - -define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epu16_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epu16_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 
x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) - -define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_mask_adds_epi8_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epi8_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; CHECK-LABEL: 
test_mask_adds_epi8_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_adds_epi8_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epi8_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07] -; 
CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - -define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_mask_subs_epi8_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epi8_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> 
%res -} - -declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_subs_epi8_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epi8_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - -define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: 
test_mask_adds_epu8_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epu8_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_adds_epu8_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call 
<32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epu8_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - -define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_mask_subs_epu8_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> 
%b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epu8_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_subs_epu8_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusb %ymm1, 
%ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epu8_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -112,11 +112,17 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %1 = sext <16 x i8> %arg0 to <16 x i16> + %2 = sext <16 x i8> %arg1 to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %6 = icmp sgt <16 x
i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %8 = trunc <16 x i16> %7 to <16 x i8> + %bc = bitcast <16 x i8> %8 to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_adds_epi16: @@ -130,11 +136,17 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %1 = sext <8 x i16> %arg0 to <8 x i32> + %2 = sext <8 x i16> %arg1 to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <8 x i32> %7 to <8 x i16> + %bc = bitcast <8 x i16> %8 to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_adds_epu8: @@ -148,11 +160,15 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %1 = zext <16 x i8> %arg0 to <16 x i16> + %2 = zext <16 x i8> %arg1 to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %6 = trunc <16 x i16> %5 to <16 x i8> + %bc = bitcast <16 x i8> %6 to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_adds_epu16: @@ -166,11 +182,15 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %1 = zext <8 x i16> %arg0 to <8 x i32> + %2 = zext <8 x i16> %arg1 to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp ult <8 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %6 = trunc <8 x i32> %5 to <8 x i16> + %bc = bitcast <8 x i16> %6 to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm_and_pd: @@ -3507,11 +3527,17 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %1 = sext <16 x i8> %arg0 to <16 x i16> + %2 = sext <16 x i8> %arg1 to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %8 = trunc <16 x i16> %7 to <16 x i8> + %bc = bitcast <16 x i8> %8 to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_subs_epi16: @@ -3525,47 +3551,69 @@ ; X64-NEXT: retq %arg0 = 
bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %1 = sext <8 x i16> %arg0 to <8 x i32> + %2 = sext <8 x i16> %arg1 to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <8 x i32> %7 to <8 x i16> + %bc = bitcast <8 x i16> %8 to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_subs_epu8: ; X32: # %bb.0: -; X32-NEXT: psubusb %xmm1, %xmm0 +; X32-NEXT: pmaxub %xmm1, %xmm0 +; X32-NEXT: psubb %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_subs_epu8: ; X64: # %bb.0: -; X64-NEXT: psubusb %xmm1, %xmm0 +; X64-NEXT: pmaxub %xmm1, %xmm0 +; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %cmp = icmp ugt <16 x i8> %arg0, %arg1 + %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1 + %sub = sub <16 x i8> %sel, %arg1 + %bc = bitcast <16 x i8> %sub to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_subs_epu16: ; X32: # %bb.0: -; X32-NEXT: psubusw %xmm1, %xmm0 +; X32-NEXT: movdqa .LCPI190_0, %xmm2 # xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X32-NEXT: movdqa %xmm1, %xmm3 +; X32-NEXT: pxor %xmm2, %xmm3 +; X32-NEXT: pxor %xmm2, %xmm0 +; X32-NEXT: pmaxsw %xmm3, %xmm0 +; X32-NEXT: pxor %xmm2, %xmm0 +; X32-NEXT: psubw %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_subs_epu16: ; X64: # %bb.0: -; X64-NEXT: psubusw %xmm1, %xmm0 +; X64-NEXT: movdqa .LCPI190_0(%rip), %xmm2 # xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: pxor %xmm2, %xmm3 +; X64-NEXT: pxor %xmm2, %xmm0 +; X64-NEXT: pmaxsw %xmm3, %xmm0 +; X64-NEXT: pxor %xmm2, %xmm0 +; X64-NEXT: psubw %xmm1, %xmm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %cmp = icmp ugt <8 x i16> %arg0, %arg1 + %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1 + %sub = sub <8 x i16> %sel, %arg1 + %bc = bitcast <8 x i16> %sub to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm_ucomieq_sd: Index: test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -246,6 +246,172 @@ } declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone +define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_padds_b: +; SSE: ## %bb.0: +; SSE-NEXT: paddsb %xmm1, %xmm0 ## encoding: 
[0x66,0x0f,0xec,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_padds_w: +; SSE: ## %bb.0: +; SSE-NEXT: paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone + + +define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_b: +; SSE: ## %bb.0: +; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_w: +; SSE: ## %bb.0: +; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone + + +define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_b: +; SSE: ## %bb.0: +; SSE-NEXT: psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: 
test_x86_sse2_psubs_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_w: +; SSE: ## %bb.0: +; SSE-NEXT: psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone + + +define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_b: +; SSE: ## %bb.0: +; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_w: +; SSE: ## %bb.0: +; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: test_x86_sse2_pmulu_dq: Index: test/CodeGen/X86/sse2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-x86.ll +++ test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -882,90 +882,6 @@ } -define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_padds_b: -; SSE: ## %bb.0: -; SSE-NEXT: paddsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xec,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_padds_b: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: 
[0xc5,0xf9,0xec,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_padds_b: -; SKX: ## %bb.0: -; SKX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: test_x86_sse2_padds_w: -; SSE: ## %bb.0: -; SSE-NEXT: paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_padds_w: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_padds_w: -; SKX: ## %bb.0: -; SKX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone - - -define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_paddus_b: -; SSE: ## %bb.0: -; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_paddus_b: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_paddus_b: -; SKX: ## %bb.0: -; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: test_x86_sse2_paddus_w: -; SSE: ## %bb.0: -; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_paddus_w: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_paddus_w: -; SKX: ## %bb.0: -; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone - - define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_sse2_pmadd_wd: ; SSE: ## %bb.0: @@ -1486,90 +1402,6 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone -define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_psubs_b: -; SSE: ## %bb.0: -; SSE-NEXT: psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_psubs_b: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1] -; AVX2-NEXT: retl ## 
encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_psubs_b: -; SKX: ## %bb.0: -; SKX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: test_x86_sse2_psubs_w: -; SSE: ## %bb.0: -; SSE-NEXT: psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_psubs_w: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_psubs_w: -; SKX: ## %bb.0: -; SKX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone - - -define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_psubus_b: -; SSE: ## %bb.0: -; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_psubus_b: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_psubus_b: -; SKX: ## %bb.0: -; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: test_x86_sse2_psubus_w: -; SSE: ## %bb.0: -; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_psubus_w: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_psubus_w: -; SKX: ## %bb.0: -; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone - - define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) { ; SSE-LABEL: test_x86_sse2_sqrt_pd: ; SSE: ## %bb.0: Index: test/CodeGen/X86/sse2-schedule.ll =================================================================== --- test/CodeGen/X86/sse2-schedule.ll +++ test/CodeGen/X86/sse2-schedule.ll @@ -7122,6 +7122,7 @@ } declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone + define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_paddsw: ; GENERIC: # %bb.0: @@ -7228,12 +7229,25 @@ ; ZNVER1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpaddsw (%rdi), %xmm0, 
%xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) - %2 = load <8 x i16>, <8 x i16> *%a2, align 16 - %3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2) - ret <8 x i16> %3 + %1 = sext <8 x i16> %a0 to <8 x i32> + %2 = sext <8 x i16> %a1 to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = load <8 x i16>, <8 x i16> *%a2, align 16 + %10 = sext <8 x i16> %8 to <8 x i32> + %11 = sext <8 x i16> %9 to <8 x i32> + %12 = add nsw <8 x i32> %10, %11 + %13 = icmp slt <8 x i32> %12, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %14 = select <8 x i1> %13, <8 x i32> %12, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %15 = icmp sgt <8 x i32> %14, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %16 = select <8 x i1> %15, <8 x i32> %14, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %17 = trunc <8 x i32> %16 to <8 x i16> + ret <8 x i16> %17 } -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; GENERIC-LABEL: test_paddusb: @@ -7341,12 +7355,21 @@ ; ZNVER1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) - %2 = load <16 x i8>, <16 x i8> *%a2, align 16 - %3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2) - ret <16 x i8> %3 + %1 = zext <16 x i8> %a0 to <16 x i16> + %2 = zext <16 x i8> %a1 to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %6 = trunc <16 x i16> %5 to <16 x i8> + %7 = load <16 x i8>, <16 x i8> *%a2, align 16 + %8 = zext <16 x i8> %6 to <16 x i16> + %9 = zext <16 x i8> %7 to <16 x i16> + %10 = add nsw <16 x i16> %8, %9 + %11 = icmp ult <16 x i16> %10, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %12 = select <16 x i1> %11, <16 x i16> %10, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %13 = trunc <16 x i16> %12 to <16 x i8> + ret <16 x i8> %13 } -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_paddusw: @@ -7454,12 +7477,21 @@ ; ZNVER1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) - %2 = load <8 x i16>, <8 x i16> *%a2, align 16 - %3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2) - ret <8 x i16> %3 + %1 = zext <8 x i16> %a0 to <8 x i32> + %2 = zext <8 x i16> %a1 to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp ult <8 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %6 = trunc <8 x i32> %5 to <8 x i16> + %7 = load <8 x i16>, <8 x i16> *%a2, align 16 + %8 = zext <8 x i16> %6 to <8 x i32> + %9 = zext <8 x i16> %7 to <8 x i32> + %10 = add nsw <8 x i32> %8, %9 + %11 = icmp ult <8 x i32> %10, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %12 = select <8 x i1> %11, <8 x i32> %10, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %13 = trunc <8 x i32> %12 to <8 x i16> + ret <8 x i16> %13 } -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_paddw: @@ -12585,12 +12617,25 @@ ; ZNVER1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; 
ZNVER1-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) - %2 = load <8 x i16>, <8 x i16> *%a2, align 16 - %3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2) - ret <8 x i16> %3 + %1 = sext <8 x i16> %a0 to <8 x i32> + %2 = sext <8 x i16> %a1 to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = load <8 x i16>, <8 x i16> *%a2, align 16 + %10 = sext <8 x i16> %8 to <8 x i32> + %11 = sext <8 x i16> %9 to <8 x i32> + %12 = sub nsw <8 x i32> %10, %11 + %13 = icmp slt <8 x i32> %12, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %14 = select <8 x i1> %13, <8 x i32> %12, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %15 = icmp sgt <8 x i32> %14, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %16 = select <8 x i1> %15, <8 x i32> %14, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %17 = trunc <8 x i32> %16 to <8 x i16> + ret <8 x i16> %17 } -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; GENERIC-LABEL: test_psubusb: @@ -12698,12 +12743,15 @@ ; ZNVER1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) - %2 = load <16 x i8>, <16 x i8> *%a2, align 16 - %3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2) - ret <16 x i8> %3 + %1 = icmp ugt <16 x i8> %a0, %a1 + %2 = select <16 x i1> %1, <16 x i8> %a0, <16 x i8> %a1 + %3 = sub <16 x i8> %2, %a1 + %4 = load <16 x i8>, <16 x i8> *%a2, align 16 + %5 = icmp ugt <16 x i8> %3, %4 + %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4 + %7 = sub <16 x i8> %6, %4 + ret <16 x i8> %7 } -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psubusw: @@ -12811,12 +12859,15 @@ ; ZNVER1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) - %2 = load <8 x i16>, <8 x i16> *%a2, align 16 - %3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2) - ret <8 x i16> %3 + %1 = icmp ugt <8 x i16> %a0, %a1 + %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> %a1 + %3 = sub <8 x i16> %2, %a1 + %4 = load <8 x i16>, <8 x i16> *%a2, align 16 + %5 = icmp ugt <8 x i16> %3, %4 + %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4 + %7 = sub <8 x i16> %6, %4 + ret <8 x i16> %7 } -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psubw: Index: test/CodeGen/X86/vector-arith-sat.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/vector-arith-sat.ll @@ -0,0 +1,3021 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s 
--check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding | FileCheck %s +; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE + +define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) { +; AVX2-LABEL: test_x86_avx2_padds_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_padds_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: ret{{[l|q]}} + %1 = sext <32 x i8> %a0 to <32 x i16> + %2 = sext <32 x i8> %a1 to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + + +define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) { +; AVX2-LABEL: test_x86_avx2_padds_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_padds_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: ret{{[l|q]}} + %1 = sext <16 x i16> %a0 to <16 x i32> + %2 = sext <16 x i16> %a1 to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + + +define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_mask_adds_epi16_rr_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddsw %ymm2, %ymm0, %ymm0 +; X86-AVX-NEXT: vpaddsw %ymm3, %ymm1, %ymm1 +; X86-AVX-NEXT: retl +; +; X86-AVX512VL-LABEL: test_mask_adds_epi16_rr_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; X86-AVX512VL-NEXT: retl +; +; X64-AVX-LABEL: test_mask_adds_epi16_rr_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddsw %ymm2, %ymm0, %ymm0 +; X64-AVX-NEXT: vpaddsw %ymm3, %ymm1, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512VL-LABEL: test_mask_adds_epi16_rr_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: retq + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + + +define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) { +; AVX2-LABEL: test_x86_avx2_paddus_b: +; AVX2: ## %bb.0: +; 
AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_paddus_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: ret{{[l|q]}} + %1 = zext <32 x i8> %a0 to <32 x i16> + %2 = zext <32 x i8> %a1 to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %6 = trunc <32 x i16> %5 to <32 x i8> + ret <32 x i8> %6 +} + + +define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) { +; AVX2-LABEL: test_x86_avx2_paddus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_paddus_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: ret{{[l|q]}} + %1 = zext <16 x i16> %a0 to <16 x i32> + %2 = zext <16 x i16> %a1 to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp ult <16 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %6 = trunc <16 x i32> %5 to <16 x i16> + ret <16 x i16> %6 +} + + +define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_mask_adds_epu16_rr_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 +; X86-AVX-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 +; X86-AVX-NEXT: retl +; +; X86-AVX512VL-LABEL: test_mask_adds_epu16_rr_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; X86-AVX512VL-NEXT: retl +; +; X64-AVX-LABEL: test_mask_adds_epu16_rr_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 +; X64-AVX-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512VL-LABEL: test_mask_adds_epu16_rr_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: retq + %1 = zext <32 x i16> %a to <32 x i32> + %2 = zext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp ult <32 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %6 = trunc <32 x i32> %5 to <32 x i16> + ret <32 x i16> %6 +} + +define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) { +; AVX2-LABEL: test_x86_avx2_psubs_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_psubs_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: ret{{[l|q]}} + %1 = sext <32 x i8> %a0 to <32 x i16> + %2 = sext <32 x i8> %a1 to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + + +define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) { +; AVX2-LABEL: test_x86_avx2_psubs_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_psubs_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: ret{{[l|q]}} + %1 = sext <16 x i16> %a0 to <16 x i32> + %2 = sext <16 x i16> %a1 to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + + +define <32 x i16> 
@test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_mask_subs_epi16_rr_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubsw %ymm2, %ymm0, %ymm0 +; X86-AVX-NEXT: vpsubsw %ymm3, %ymm1, %ymm1 +; X86-AVX-NEXT: retl +; +; X86-AVX512VL-LABEL: test_mask_subs_epi16_rr_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; X86-AVX512VL-NEXT: retl +; +; X64-AVX-LABEL: test_mask_subs_epi16_rr_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubsw %ymm2, %ymm0, %ymm0 +; X64-AVX-NEXT: vpsubsw %ymm3, %ymm1, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512VL-LABEL: test_mask_subs_epi16_rr_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: retq + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + + +define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) { +; AVX2-LABEL: test_x86_avx2_psubus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_psubus_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: ret{{[l|q]}} + %cmp = icmp ugt <32 x i8> %a0, %a1 + %sel = select <32 x i1> %cmp, <32 x i8> %a0, <32 x i8> %a1 + %sub = sub <32 x i8> %sel, %a1 + ret <32 x i8> %sub +} + + +define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) { +; AVX2-LABEL: test_x86_avx2_psubus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: test_x86_avx2_psubus_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: ret{{[l|q]}} + %cmp = icmp ugt <16 x i16> %a0, %a1 + %sel = select <16 x i1> %cmp, <16 x i16> %a0, <16 x i16> %a1 + %sub = sub <16 x i16> %sel, %a1 + ret <16 x i16> %sub +} + + +define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_mask_subs_epu16_rr_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 +; X86-AVX-NEXT: vpsubusw %ymm3, %ymm1, %ymm1 +; X86-AVX-NEXT: retl +; +; X86-AVX512VL-LABEL: test_mask_subs_epu16_rr_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; X86-AVX512VL-NEXT: retl +; +; X64-AVX-LABEL: test_mask_subs_epu16_rr_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 +; X64-AVX-NEXT: vpsubusw %ymm3, %ymm1, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512VL-LABEL: test_mask_subs_epu16_rr_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: retq + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + ret <32 x i16> %sub +} + +define <32 x i16> @test_mask_adds_epi16_rr_512_avx512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rr_512_avx512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512_avx512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 
= icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; 
AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <64 x i16> @test_mask_adds_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddsw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpaddsw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddsw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: retl + %1 = sext <64 x i16> %a to <64 x i32> + %2 = sext <64 x i16> %b to <64 x i32> + %3 = add nsw <64 x i32> %1, %2 + %4 = icmp slt <64 x i32> %3, + %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> + %6 = icmp sgt <64 x i32> %5, + %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> + %8 = trunc <64 x i32> %7 to <64 x i16> + ret <64 x i16> %8 +} + +define <32 x i16> @test_mask_subs_epi16_rr_512_avx512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rr_512_avx512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512_avx512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + 
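All of the tests in these files expand the saturating operations into the generic widen / clamp / truncate idiom that the backend now pattern-matches back to vpaddsw, vpsubsw, vpaddusb and friends. The splat clamp constants are elided in the hunks above; for reference, a minimal sketch of the signed <8 x i16> add pattern with the 32767 / -32768 splats written out in full (the function name is illustrative only, not part of the patch):

define <8 x i16> @sadd_sat_v8i16_sketch(<8 x i16> %a, <8 x i16> %b) {
  ; widen to i32 so the add cannot wrap
  %1 = sext <8 x i16> %a to <8 x i32>
  %2 = sext <8 x i16> %b to <8 x i32>
  %3 = add nsw <8 x i32> %1, %2
  ; clamp from above at SHRT_MAX
  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  ; clamp from below at SHRT_MIN
  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  ; narrow back; every lane is now in [-32768, 32767]
  %8 = trunc <8 x i32> %7 to <8 x i16>
  ret <8 x i16> %8
}

The unsigned add tests clamp only from above (65535 for i16, 255 for i8), and the unsigned-subtract tests use the icmp ugt / select / sub idiom seen throughout, which never needs widening.
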
+define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to 
<32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <64 x i16> @test_mask_subs_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsubsw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpsubsw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpsubsw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: retl + %1 = sext <64 x i16> %a to <64 x i32> + %2 = sext <64 x i16> %b to <64 x i32> + %3 = sub nsw <64 x i32> %1, %2 + %4 = icmp slt <64 x i32> %3, + %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> + %6 = icmp sgt <64 x i32> %5, + %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> + %8 = trunc <64 x i32> %7 to <64 x i16> + ret <64 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epu16_rr_512_avx512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rr_512_avx512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512_avx512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = zext <32 x i16> %a to <32 x i32> + %2 = zext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp ult <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = trunc <32 x i32> %5 to <32 x i16> + ret <32 x i16> %6 +} + +define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 
+; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %1 = zext <32 x i16> %a to <32 x i32> + %2 = zext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp ult <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = trunc <32 x i32> %5 to <32 x i16> + %7 = bitcast i32 %mask to <32 x i1> + %8 = select <32 x i1> %7, <32 x i16> %6, <32 x i16> %passThru + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %1 = zext <32 x i16> %a to <32 x i32> + %2 = zext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp ult <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = trunc <32 x i32> %5 to <32 x i16> + %7 = bitcast i32 %mask to <32 x i1> + %8 = select <32 x i1> %7, <32 x i16> %6, <32 x i16> zeroinitializer + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = zext <32 x i16> %a to <32 x i32> + %2 = zext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp ult <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = trunc <32 x i32> %5 to <32 x i16> + ret <32 x i16> %6 +} + +define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = zext <32 x i16> %a to <32 x i32> + %2 = zext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp ult <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = trunc <32 x i32> %5 to <32 x i16> + %7 = bitcast i32 %mask to <32 x i1> + %8 = select <32 x i1> %7, <32 x i16> %6, <32 x i16> %passThru + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; 
AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = zext <32 x i16> %a to <32 x i32> + %2 = zext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp ult <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = trunc <32 x i32> %5 to <32 x i16> + %7 = bitcast i32 %mask to <32 x i1> + %8 = select <32 x i1> %7, <32 x i16> %6, <32 x i16> zeroinitializer + ret <32 x i16> %8 +} + +define <64 x i16> @test_mask_adds_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddusw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpaddusw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddusw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: retl + %1 = zext <64 x i16> %a to <64 x i32> + %2 = zext <64 x i16> %b to <64 x i32> + %3 = add nsw <64 x i32> %1, %2 + %4 = icmp ult <64 x i32> %3, + %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> + %6 = trunc <64 x i32> %5 to <64 x i16> + ret <64 x i16> %6 +} + +define <32 x i16> @test_mask_subs_epu16_rr_512_avx512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rr_512_avx512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512_avx512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + ret <32 x i16> %sub +} + +define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd 
{{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + ret <32 x i16> %sub +} + +define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer + ret <32 x i16> %res +} + +define <64 x i16> @test_mask_subs_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsubusw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpsubusw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpsubusw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl 
%ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <64 x i16> %a, %b + %sel = select <64 x i1> %cmp, <64 x i16> %a, <64 x i16> %b + %sub = sub <64 x i16> %sel, %b + ret <64 x i16> %sub +} + +define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + 
ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + + +define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: retq + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; 
CHECK-LABEL: test_mask_adds_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: 
test_mask_subs_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: retq + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: 
vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp ult <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = trunc <8 x i32> %5 to <8 x i16> + ret <8 x i16> %6 +} + +define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x 
i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp ult <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = trunc <8 x i32> %5 to <8 x i16> + %7 = bitcast i8 %mask to <8 x i1> + %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> %passThru + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp ult <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = trunc <8 x i32> %5 to <8 x i16> + %7 = bitcast i8 %mask to <8 x i1> + %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> zeroinitializer + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp ult <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = trunc <8 x i32> %5 to <8 x i16> + ret <8 x i16> %6 +} + +define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp ult <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = trunc <8 x i32> %5 to <8 x i16> + %7 = bitcast i8 %mask to <8 x i1> + %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> %passThru + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp ult <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = trunc <8 x i32> %5 to <8 x i16> + %7 = bitcast i8 %mask to <8 x i1> + %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> zeroinitializer + ret <8 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %1 = zext <16 x i16> %a to <16 x i32> + %2 = zext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp ult <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = trunc <16 x i32> %5 to <16 x i16> + ret <16 x i16> %6 +} + +define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: 
retq + %1 = zext <16 x i16> %a to <16 x i32> + %2 = zext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp ult <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = trunc <16 x i32> %5 to <16 x i16> + %7 = bitcast i16 %mask to <16 x i1> + %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> %passThru + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = zext <16 x i16> %a to <16 x i32> + %2 = zext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp ult <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = trunc <16 x i32> %5 to <16 x i16> + %7 = bitcast i16 %mask to <16 x i1> + %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> zeroinitializer + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = zext <16 x i16> %a to <16 x i32> + %2 = zext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp ult <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = trunc <16 x i32> %5 to <16 x i16> + ret <16 x i16> %6 +} + +define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = zext <16 x i16> %a to <16 x i32> + %2 = zext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp ult <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = trunc <16 x i32> %5 to <16 x i16> + %7 = bitcast i16 %mask to <16 x i1> + %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> %passThru + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = zext <16 x i16> %a to <16 x i32> + %2 = zext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp ult <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = trunc <16 x i32> %5 to <16 x i16> + %7 = bitcast i16 %mask to <16 x i1> + %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> zeroinitializer + ret <16 x i16> %8 +} + +define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + ret <8 x i16> %sub +} + +define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 
{%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: retq + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + ret <8 x i16> %sub +} + +define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <8 x i16>, <8 x i16>* %ptr_b + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer + ret <8 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + ret <16 x i16> %sub +} + +define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: retq + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd 
%edi, %k1 +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + ret <16 x i16> %sub +} + +define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <16 x i16>, <16 x i16>* %ptr_b + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer + ret <16 x i16> %res +} + +define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %1 
= sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: retq + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add 
nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: retq + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> %10 +} + +define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp 
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru
+  ret <16 x i8> %10
+}
+
+define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer
+  ret <16 x i8> %10
+}
+
+define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi8_rm_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru
+  ret <16 x i8> %10
+}
+
+define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer
+  ret <16 x i8> %10
+}
+
+define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epi8_rr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  ret <32 x i8> %8
+}
+
+define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
+  ret <32 x i8> %10
+}
+
+define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
+  ret <32 x i8> %10
+}
+
+define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi8_rm_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  ret <32 x i8> %8
+}
+
+define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
+  ret <32 x i8> %10
+}
+
+define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
+  ret <32 x i8> %10
+}
+
+define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %1 = zext <16 x i8> %a to <16 x i16>
+  %2 = zext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <16 x i16> %5 to <16 x i8>
+  ret <16 x i8> %6
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+  %1 = zext <16 x i8> %a to <16 x i16>
+  %2 = zext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <16 x i16> %5 to <16 x i8>
+  %7 = bitcast i16 %mask to <16 x i1>
+  %8 = select <16 x i1> %7, <16 x i8> %6, <16 x i8> %passThru
+  ret <16 x i8> %8
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %1 = zext <16 x i8> %a to <16 x i16>
+  %2 = zext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <16 x i16> %5 to <16 x i8>
+  %7 = bitcast i16 %mask to <16 x i1>
+  %8 = select <16 x i1> %7, <16 x i8> %6, <16 x i8> zeroinitializer
+  ret <16 x i8> %8
+}
+
+define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = zext <16 x i8> %a to <16 x i16>
+  %2 = zext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <16 x i16> %5 to <16 x i8>
+  ret <16 x i8> %6
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = zext <16 x i8> %a to <16 x i16>
+  %2 = zext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <16 x i16> %5 to <16 x i8>
+  %7 = bitcast i16 %mask to <16 x i1>
+  %8 = select <16 x i1> %7, <16 x i8> %6, <16 x i8> %passThru
+  ret <16 x i8> %8
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = zext <16 x i8> %a to <16 x i16>
+  %2 = zext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <16 x i16> %5 to <16 x i8>
+  %7 = bitcast i16 %mask to <16 x i1>
+  %8 = select <16 x i1> %7, <16 x i8> %6, <16 x i8> zeroinitializer
+  ret <16 x i8> %8
+}
+
+define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %1 = zext <32 x i8> %a to <32 x i16>
+  %2 = zext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <32 x i16> %5 to <32 x i8>
+  ret <32 x i8> %6
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+  %1 = zext <32 x i8> %a to <32 x i16>
+  %2 = zext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <32 x i16> %5 to <32 x i8>
+  %7 = bitcast i32 %mask to <32 x i1>
+  %8 = select <32 x i1> %7, <32 x i8> %6, <32 x i8> %passThru
+  ret <32 x i8> %8
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %1 = zext <32 x i8> %a to <32 x i16>
+  %2 = zext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <32 x i16> %5 to <32 x i8>
+  %7 = bitcast i32 %mask to <32 x i1>
+  %8 = select <32 x i1> %7, <32 x i8> %6, <32 x i8> zeroinitializer
+  ret <32 x i8> %8
+}
+
+define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = zext <32 x i8> %a to <32 x i16>
+  %2 = zext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <32 x i16> %5 to <32 x i8>
+  ret <32 x i8> %6
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = zext <32 x i8> %a to <32 x i16>
+  %2 = zext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <32 x i16> %5 to <32 x i8>
+  %7 = bitcast i32 %mask to <32 x i1>
+  %8 = select <32 x i1> %7, <32 x i8> %6, <32 x i8> %passThru
+  ret <32 x i8> %8
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = zext <32 x i8> %a to <32 x i16>
+  %2 = zext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <32 x i16> %5 to <32 x i8>
+  %7 = bitcast i32 %mask to <32 x i1>
+  %8 = select <32 x i1> %7, <32 x i8> %6, <32 x i8> zeroinitializer
+  ret <32 x i8> %8
+}
+
+define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  ret <16 x i8> %sub
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  ret <16 x i8> %sub
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  ret <32 x i8> %sub
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  ret <32 x i8> %sub
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer
+  ret <32 x i8> %res
+}
+
+define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_padds_b:
+; SSE: ## %bb.0:
+; SSE-NEXT: paddsb %xmm1, %xmm0
+; SSE-NEXT: retl
+  %1 = sext <16 x i8> %a0 to <16 x i16>
+  %2 = sext <16 x i8> %a1 to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+
+define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_padds_w:
+; SSE: ## %bb.0:
+; SSE-NEXT: paddsw %xmm1, %xmm0
+; SSE-NEXT: retl
+  %1 = sext <8 x i16> %a0 to <8 x i32>
+  %2 = sext <8 x i16> %a1 to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
+
+
+define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_b:
+; SSE: ## %bb.0:
+; SSE-NEXT: paddusb %xmm1, %xmm0
+; SSE-NEXT: retl
+  %1 = zext <16 x i8> %a0 to <16 x i16>
+  %2 = zext <16 x i8> %a1 to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <16 x i16> %5 to <16 x i8>
+  ret <16 x i8> %6
+}
+
+
+define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_w:
+; SSE: ## %bb.0:
+; SSE-NEXT: paddusw %xmm1, %xmm0
+; SSE-NEXT: retl
+  %1 = zext <8 x i16> %a0 to <8 x i32>
+  %2 = zext <8 x i16> %a1 to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp ult <8 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %6 = trunc <8 x i32> %5 to <8 x i16>
+  ret <8 x i16> %6
+}
+
+define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_psubs_b:
+; SSE: ## %bb.0:
+; SSE-NEXT: psubsb %xmm1, %xmm0
+; SSE-NEXT: retl
+  %1 = sext <16 x i8> %a0 to <16 x i16>
+  %2 = sext <16 x i8> %a1 to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+
+define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubs_w:
+; SSE: ## %bb.0:
+; SSE-NEXT: psubsw %xmm1, %xmm0
+; SSE-NEXT: retl
+  %1 = sext <8 x i16> %a0 to <8 x i32>
+  %2 = sext <8 x i16> %a1 to <8 x i32>
+  %3 = sub nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
+
+
+define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_b:
+; SSE: ## %bb.0:
+; SSE-NEXT: psubusb %xmm1, %xmm0
+; SSE-NEXT: retl
+  %cmp = icmp ugt <16 x i8> %a0, %a1
+  %sel = select <16 x i1> %cmp, <16 x i8> %a0, <16 x i8> %a1
+  %sub = sub <16 x i8> %sel, %a1
+  ret <16 x i8> %sub
+}
+
+
+define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_w:
+; SSE: ## %bb.0:
+; SSE-NEXT: psubusw %xmm1, %xmm0
+; SSE-NEXT: retl
+  %cmp = icmp ugt <8 x i16> %a0, %a1
+  %sel = select <8 x i1> %cmp, <8 x i16> %a0, <8 x i16> %a1
+  %sub = sub <8 x i16> %sel, %a1
+  ret <8 x i16> %sub
+}
+
+define <8 x i8> @test_x86_sse2_padds_b_64(<8 x i8> %a0, <8 x i8> %a1) {
+; AVX512BW-LABEL: test_x86_sse2_padds_b_64:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; SSE-LABEL: test_x86_sse2_padds_b_64:
+; SSE: ## %bb.0:
+; SSE-NEXT: psllw $8, %xmm0
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: psllw $8, %xmm1
+; SSE-NEXT: psraw $8, %xmm1
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pminsw LCPI144_0, %xmm0
+; SSE-NEXT: pmaxsw LCPI144_1, %xmm0
+; SSE-NEXT: retl
+  %1 = sext <8 x i8> %a0 to <8 x i16>
+  %2 = sext <8 x i8> %a1 to <8 x i16>
+  %3 = add nsw <8 x i16> %1, %2
+  %4 = icmp slt <8 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <8 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <8 x i16> %7 to <8 x i8>
+  ret <8 x i8> %8
+}
+
+define <4 x i16> @test_x86_sse2_padds_w_64(<4 x i16> %a0, <4 x i16> %a1) {
+; AVX512BW-LABEL: test_x86_sse2_padds_w_64:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; SSE-LABEL: test_x86_sse2_padds_w_64:
+; SSE: ## %bb.0:
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [32767,32767,32767,32767]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm2
+; SSE-NEXT: por %xmm1, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: retl
+  %1 = sext <4 x i16> %a0 to <4 x i32>
+  %2 = sext <4 x i16> %a1 to <4 x i32>
+  %3 = add nsw <4 x i32> %1, %2
+  %4 = icmp slt <4 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <4 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <4 x i32> %7 to <4 x i16>
+  ret <4 x i16> %8
+}
+
+
+define <8 x i8> @test_x86_sse2_paddus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
+; AVX512BW-LABEL: test_x86_sse2_paddus_b_64:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; SSE-LABEL: test_x86_sse2_paddus_b_64:
+; SSE: ## %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pminsw LCPI146_0, %xmm0
+; SSE-NEXT: retl
+  %1 = zext <8 x i8> %a0 to <8 x i16>
+  %2 = zext <8 x i8> %a1 to <8 x i16>
+  %3 = add nsw <8 x i16> %1, %2
+  %4 = icmp ult <8 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %6 = trunc <8 x i16> %5 to <8 x i8>
+  ret <8 x i8> %6
+}
+
+
+define <4 x i16> @test_x86_sse2_paddus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
+; AVX512BW-LABEL: test_x86_sse2_paddus_w_64:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; SSE-LABEL: test_x86_sse2_paddus_w_64:
+; SSE: ## %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retl
+  %1 = zext <4 x i16> %a0 to <4 x i32>
+  %2 = zext <4 x i16> %a1 to <4 x i32>
+  %3 = add nsw <4 x i32> %1, %2
+  %4 = icmp ult <4 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+  %6 = trunc <4 x i32> %5 to <4 x i16>
+  ret <4 x i16> %6
+}
+
+define <8 x i8> @test_x86_sse2_psubs_b_64(<8 x i8> %a0, <8 x i8> %a1) {
+; AVX512BW-LABEL: test_x86_sse2_psubs_b_64:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; SSE-LABEL: test_x86_sse2_psubs_b_64:
+; SSE: ## %bb.0:
+; SSE-NEXT: psllw $8, %xmm0
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: psllw $8, %xmm1
+; SSE-NEXT: psraw $8, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: pminsw LCPI148_0, %xmm0
+; SSE-NEXT: pmaxsw LCPI148_1, %xmm0
+; SSE-NEXT: retl
+  %1 = sext <8 x i8> %a0 to <8 x i16>
+  %2 = sext <8 x i8> %a1 to <8 x i16>
+  %3 = sub nsw <8 x i16> %1, %2
+  %4 = icmp slt <8 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <8 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <8 x i16> %7 to <8 x i8>
+  ret <8 x i8> %8
+}
+
+
+define <4 x i16> @test_x86_sse2_psubs_w_64(<4 x i16> %a0, <4 x i16> %a1) {
+; AVX512BW-LABEL: test_x86_sse2_psubs_w_64:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; SSE-LABEL: test_x86_sse2_psubs_w_64:
+; SSE: ## %bb.0:
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: retl
+  %1 = sext <4 x i16> %a0 to <4 x i32>
+  %2 = sext <4 x i16> %a1 to <4 x i32>
+  %3 = sub nsw <4 x i32> %1, %2
+  %4 = icmp slt <4 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <4 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <4 x i32> %7 to <4 x i16>
+  ret <4 x i16> %8
+}
+
+
+define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
+; AVX512BW-LABEL: test_x86_sse2_psubus_b_64:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; SSE-LABEL: test_x86_sse2_psubus_b_64:
+; SSE: ## %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pmaxsw %xmm3, %xmm0
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: retl
+  %cmp = icmp ugt <8 x i8> %a0, %a1
+  %sel = select <8 x i1> %cmp, <8 x i8> %a0, <8 x i8> %a1
+  %sub = sub <8 x i8> %sel, %a1
+  ret <8 x i8> %sub
+}
+
+
+define <4 x i16> @test_x86_sse2_psubus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
+; AVX512BW-LABEL: test_x86_sse2_psubus_w_64:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX512BW-NEXT: vpmaxud %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; SSE-LABEL: test_x86_sse2_psubus_w_64:
+; SSE: ## %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: psubd %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retl
+  %cmp = icmp ugt <4 x i16> %a0, %a1
+  %sel = select <4 x i1> %cmp, <4 x i16> %a0, <4 x i16> %a1
+  %sub = sub <4 x i16> %sel, %a1
+  ret <4 x i16> %sub
+}
Index: test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll
===================================================================
--- test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll
+++ test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll
@@ -46,14 +46,14 @@
 ; Check that shadow is OR'ed, and origin is Select'ed
 ; And no shadow checks!
-define <8 x i16> @Paddsw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable sanitize_memory {
-  %call = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b)
+define <8 x i16> @Pmulhuw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable sanitize_memory {
+  %call = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %call
 }
 
-declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) nounwind
+declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a, <8 x i16> %b) nounwind
 
-; CHECK-LABEL: @Paddsw128
+; CHECK-LABEL: @Pmulhuw128
 ; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls
 ; CHECK-ORIGINS: load i32, i32* {{.*}} @__msan_param_origin_tls
 ; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls
@@ -62,7 +62,7 @@
 ; CHECK-ORIGINS: = bitcast <8 x i16> {{.*}} to i128
 ; CHECK-ORIGINS-NEXT: = icmp ne i128 {{.*}}, 0
 ; CHECK-ORIGINS-NEXT: = select i1 {{.*}}, i32 {{.*}}, i32
-; CHECK-NEXT: call <8 x i16> @llvm.x86.sse2.padds.w
+; CHECK-NEXT: call <8 x i16> @llvm.x86.sse2.pmulhu.w
 ; CHECK-NEXT: store <8 x i16> {{.*}} @__msan_retval_tls
 ; CHECK-ORIGINS: store i32 {{.*}} @__msan_retval_origin_tls
 ; CHECK-NEXT: ret <8 x i16>