Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -364,28 +364,16 @@ // Integer arithmetic ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb128">, + def int_x86_sse2_padds_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw128">, + def int_x86_sse2_padds_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb128">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, - llvm_v16i8_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, - llvm_v8i16_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb128">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, - llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_sse2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, - llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_sse2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb128">, + def int_x86_sse2_psubs_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_sse2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw128">, + def int_x86_sse2_psubs_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">, @@ -1358,28 +1346,16 @@ // Integer arithmetic ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, - llvm_v32i8_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, - llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb256">, + def int_x86_avx2_padds_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw256">, + def int_x86_avx2_padds_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, - llvm_v32i8_ty], [IntrNoMem]>; - def int_x86_avx2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, - llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb256">, + def int_x86_avx2_psubs_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; - def int_x86_avx2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw256">, + def int_x86_avx2_psubs_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">, @@ -3683,7 +3659,7 @@ def int_x86_avx512_mask_padds_b_256 : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512_mask">, + def int_x86_avx512_mask_padds_b_512 : // FIXME: remove this intrinsic Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; def int_x86_avx512_mask_padds_w_128 : // FIXME: remove this intrinsic @@ -3692,25 +3668,7 @@ def int_x86_avx512_mask_padds_w_256 : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512_mask">, - Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_b_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_b_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, - llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_b_512 : GCCBuiltin<"__builtin_ia32_paddusb512_mask">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_w_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, - llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_w_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, - llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_w_512 : GCCBuiltin<"__builtin_ia32_paddusw512_mask">, + def int_x86_avx512_mask_padds_w_512 : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_psubs_b_128 : // FIXME: remove this intrinsic @@ -3719,7 +3677,7 @@ def int_x86_avx512_mask_psubs_b_256 : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512_mask">, + def int_x86_avx512_mask_psubs_b_512 : // FIXME: remove this intrinsic Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; def int_x86_avx512_mask_psubs_w_128 : // FIXME: remove this intrinsic @@ -3728,25 +3686,7 @@ def int_x86_avx512_mask_psubs_w_256 : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512_mask">, - Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_b_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_b_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, - llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_b_512 : GCCBuiltin<"__builtin_ia32_psubusb512_mask">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_w_128 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, - llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_w_256 : // FIXME: remove this intrinsic - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, - llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">, + def int_x86_avx512_mask_psubs_w_512 : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">, Index: lib/IR/AutoUpgrade.cpp =================================================================== --- lib/IR/AutoUpgrade.cpp +++ lib/IR/AutoUpgrade.cpp @@ -71,7 +71,13 @@ // like to use this information to remove upgrade code for some older // intrinsics. It is currently undecided how we will determine that future // point. - if (Name=="ssse3.pabs.b.128" || // Added in 6.0 + if (Name.startswith("sse2.paddus") || // Added in 7.0 + Name.startswith("sse2.psubus") || // Added in 7.0 + Name.startswith("avx2.paddus") || // Added in 7.0 + Name.startswith("avx2.psubus") || // Added in 7.0 + Name.startswith("avx512.mask.paddus") || // Added in 7.0 + Name.startswith("avx512.mask.psubus") || // Added in 7.0 + Name=="ssse3.pabs.b.128" || // Added in 6.0 Name=="ssse3.pabs.w.128" || // Added in 6.0 Name=="ssse3.pabs.d.128" || // Added in 6.0 Name.startswith("fma4.vfmadd.s") || // Added in 7.0 @@ -899,6 +905,37 @@ return EmitX86Select(Builder, Mask, Align, Passthru); } +static Value *UpgradeX86AddSubSatIntrinsics(IRBuilder<> &Builder, CallInst &CI, + bool IsAddition) { + Value *Op0 = CI.getOperand(0); + Value *Op1 = CI.getOperand(1); + + // Collect vector elements and type data. + Type *ResultType = CI.getType(); + + Value *Res; + if (IsAddition) { + // ADDUS: a > (a+b) ? ~0 : (a+b) + // If Op0 > Add, overflow occured. + Value *Add = Builder.CreateAdd(Op0, Op1); + Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, Op0, Add); + Value *Max = llvm::Constant::getAllOnesValue(ResultType); + Res = Builder.CreateSelect(ICmp, Max, Add); + } else { + // SUBUS: max(a, b) - b + Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, Op0, Op1); + Value *Select = Builder.CreateSelect(ICmp, Op0, Op1); + Res = Builder.CreateSub(Select, Op1); + } + + if (CI.getNumArgOperands() == 4) { // For masked intrinsics. + Value *VecSrc = CI.getOperand(2); + Value *Mask = CI.getOperand(3); + Res = EmitX86Select(Builder, Mask, Res, VecSrc); + } + return Res; +} + static Value *UpgradeMaskedStore(IRBuilder<> &Builder, Value *Ptr, Value *Data, Value *Mask, bool Aligned) { @@ -2059,7 +2096,15 @@ if (CI->getNumArgOperands() == 3) Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && Name.startswith("avx512.mask.palignr.")) { + } else if (IsX86 && (Name.startswith("sse2.paddus") || + Name.startswith("avx2.paddus") || + Name.startswith("avx512.mask.paddus"))) { + Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI, true /*IsAdd*/); + } else if (IsX86 && (Name.startswith("sse2.psubus") || + Name.startswith("avx2.psubus") || + Name.startswith("avx512.mask.psubus"))) { + Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI, false /*IsAdd*/); + }else if (IsX86 && Name.startswith("avx512.mask.palignr.")) { Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -32932,6 +32932,52 @@ } } + // Match VSELECTs into add with unsigned saturation. + if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || + (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) || + (Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 || + VT == MVT::v16i32 || MVT::v8i64)))) { + ISD::CondCode CC = cast(Cond.getOperand(2))->get(); + + SDValue CondLHS = Cond->getOperand(0); + SDValue CondRHS = Cond->getOperand(1); + + // Canonicalize ADD to CondRHS to simplify the logic below. + if (CondLHS.getOpcode() == ISD::ADD) { + std::swap(CondLHS, CondRHS); + CC = ISD::getSetCCSwappedOperands(CC); + } + + // Check if one of the arms of the VSELECT is vector with all bits set. + // If it's on the left side invert the predicate to simplify logic below. + SDValue Other; + if (ISD::isBuildVectorAllOnes(LHS.getNode())) { + Other = RHS; + CC = ISD::getSetCCInverse(CC, true); + } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) { + Other = LHS; + } + + // We can test against either of the addition operands. + if (Other.getNode() && Other.getNumOperands() == 2 && + (Other.getOperand(0) == CondLHS || + Other.getOperand(1) == CondLHS)) { + SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1); + + auto ADDUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef Ops) { + return DAG.getNode(X86ISD::ADDUS, DL, Ops[0].getValueType(), Ops); + }; + + // x <= x+y ? x+y : ~0 --> addus x, y + if ((CC == ISD::SETULE) && + Other.getOpcode() == ISD::ADD && Other == CondRHS) + return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS }, + ADDUSBuilder); + } + } + if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) return V; @@ -36745,6 +36791,114 @@ return DAG.getNode(Opc, DL, VT, LHS, RHS); } +/// This function detects the addition or subtraction with saturation pattern +/// between 2 i8/i16 vectors and replace this operation with the +/// efficient X86ISD::ADDUS/X86ISD::ADDS/X86ISD::SUBUS/X86ISD::SUBS instruction. +static SDValue detectAddSubSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + const SDLoc &DL) { + if (!VT.isVector()) + return SDValue(); + EVT InVT = In.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + EVT ScalarVT = VT.getVectorElementType(); + if ((ScalarVT != MVT::i8 && ScalarVT != MVT::i16) || + InVT.getSizeInBits() % 128 != 0 || !isPowerOf2_32(NumElems)) + return SDValue(); + + // InScalarVT is the intermediate type in AddSubSat pattern + // and it should be greater than the output type. + EVT InScalarVT = InVT.getVectorElementType(); + if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) + return SDValue(); + + if (!Subtarget.hasSSE2()) + return SDValue(); + + // Detect the following pattern: + // %2 = zext <16 x i8> %0 to <16 x i16> + // %3 = zext <16 x i8> %1 to <16 x i16> + // %4 = add nuw nsw <16 x i16> %3, %2 + // %5 = icmp ult <16 x i16> %4, <16 x i16> (vector of max InScalarVT values) + // %6 = select <16 x i1> %5, <16 x i16> (vector of max InScalarVT values) + // %7 = trunc <16 x i16> %6 to <16 x i8> + + // Detect a Sat Pattern + bool Signed = true; + SDValue Sat = detectSSatPattern(In, VT, false); + if (!Sat) { + Sat = detectUSatPattern(In, VT, DAG, DL); + Signed = false; + } + if (!Sat) + return SDValue(); + if (Sat.getOpcode() != ISD::ADD && Sat.getOpcode() != ISD::SUB) + return SDValue(); + + unsigned Opcode = Sat.getOpcode() == ISD::ADD ? Signed ? X86ISD::ADDS + : X86ISD::ADDUS + : Signed ? X86ISD::SUBS + : X86ISD::SUBUS; + + // Get addition elements. + SDValue LHS = Sat.getOperand(0); + SDValue RHS = Sat.getOperand(1); + + // Don't combine if both operands are constant. + if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && + ISD::isBuildVectorOfConstantSDNodes(LHS.getNode())) + return SDValue(); + + // Check if Op is a result of type promotion. + auto IsExtended = [=, &DAG](SDValue Op) { + unsigned Opcode = Op.getOpcode(); + unsigned EltSize = ScalarVT.getSizeInBits(); + unsigned ExtEltSize = InScalarVT.getSizeInBits(); + unsigned ExtPartSize = ExtEltSize - EltSize; + + // Extension of non-constant operand. + if (Opcode == ISD::ZERO_EXTEND || Opcode == ISD::SIGN_EXTEND) { + if (Signed) + return DAG.ComputeNumSignBits(Op) > ExtPartSize; + else { + APInt HighBitsMask = APInt::getHighBitsSet(ExtEltSize, ExtPartSize); + return DAG.MaskedValueIsZero(Op, HighBitsMask); + } + } else if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { + // Build vector of constant nodes. Each of them needs to be a correct + // extension from a constant of ScalarVT type. + unsigned NumOperands = Op.getNumOperands(); + for (unsigned i = 0; i < NumOperands; ++i) { + APInt Elt = cast(Op.getOperand(i))->getAPIntValue(); + if ((Signed && Elt.isSignedIntN(EltSize)) || + (!Signed && Elt.isIntN(EltSize))) + return false; + } + return true; + } + return false; + }; + + // Either both operands are extended or one of them is extended + // and another one is a vector of constants. + if (!IsExtended(LHS) || !IsExtended(RHS)) + return SDValue(); + + // Truncate extended nodes to result type. + LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS); + RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS); + + // The pattern is detected, emit ADDS/ADDUS/SUBS/SUBUS instruction. + auto AddSubSatBuilder = [Opcode](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef Ops) { + EVT VT = Ops[0].getValueType(); + return DAG.getNode(Opcode, DL, VT, Ops); + }; + return SplitOpsAndApply(DAG, Subtarget, DL, VT, { LHS, RHS }, + AddSubSatBuilder); +} + static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); @@ -36759,6 +36913,10 @@ if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // Try to detect addition or subtraction with saturation. + if (SDValue AddSubSat = detectAddSubSatPattern(Src, VT, DAG, Subtarget, DL)) + return AddSubSat; + // Try to combine truncation with signed/unsigned saturation. if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget)) return Val; Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -327,8 +327,6 @@ X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), @@ -371,8 +369,6 @@ X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0), - X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -683,12 +679,6 @@ X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, @@ -809,12 +799,6 @@ X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), - X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND), @@ -1186,8 +1170,6 @@ X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), @@ -1211,8 +1193,6 @@ X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(sse2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0), - X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE), X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT), Index: test/CodeGen/X86/avx2-intrinsics-canonical.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx2-intrinsics-canonical.ll @@ -0,0 +1,357 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c + +define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_padds_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_padds_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_padds_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_padds_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a0 to <32 x i16> + %2 = sext <32 x i8> %a1 to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_padds_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_padds_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_padds_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_padds_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a0 to <16 x i32> + %2 = sext <16 x i16> %a1 to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_paddus_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_paddus_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i8> %a0, %a1 + %2 = icmp ugt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + ret <32 x i8> %3 +} + +define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_paddus_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_paddus_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i16> %a0, %a1 + %2 = icmp ugt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + ret <16 x i16> %3 +} + +define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubs_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubs_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubs_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubs_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a0 to <32 x i16> + %2 = sext <32 x i8> %a1 to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubs_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubs_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a0 to <16 x i32> + %2 = sext <16 x i16> %a1 to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubus_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubus_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i8> %a0, %a1 + %sel = select <32 x i1> %cmp, <32 x i8> %a0, <32 x i8> %a1 + %sub = sub <32 x i8> %sel, %a1 + ret <32 x i8> %sub +} + +define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubus_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubus_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i16> %a0, %a1 + %sel = select <16 x i1> %cmp, <16 x i16> %a0, <16 x i16> %a1 + %sub = sub <16 x i16> %sel, %a1 + ret <16 x i16> %sub +} + +define <32 x i16> @test_x86_avx2_padds_w_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_x86_avx2_padds_w_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc2] +; X86-AVX-NEXT: vpaddsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xed,0xcb] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_padds_w_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_padds_w_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc2] +; X64-AVX-NEXT: vpaddsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xed,0xcb] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_padds_w_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_x86_avx2_psubs_w_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_x86_avx2_psubs_w_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc2] +; X86-AVX-NEXT: vpsubsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xe9,0xcb] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubs_w_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc2] +; X64-AVX-NEXT: vpsubsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xe9,0xcb] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_x86_avx2_paddus_w_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_x86_avx2_paddus_w_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc2] +; X86-AVX-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xdd,0xcb] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdd,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_paddus_w_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc2] +; X64-AVX-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xdd,0xcb] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdd,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + ret <32 x i16> %3 +} + +define <32 x i16> @test_x86_avx2_psubus_w_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_x86_avx2_psubus_w_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc2] +; X86-AVX-NEXT: vpsubusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xd9,0xcb] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd9,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubus_w_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc2] +; X64-AVX-NEXT: vpsubusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xd9,0xcb] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd9,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + ret <32 x i16> %sub +} Index: test/CodeGen/X86/avx2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -98,11 +98,18 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1) - %bc = bitcast <32 x i8> %res to <4 x i64> + %1 = sext <32 x i8> %arg0 to <32 x i16> + %2 = sext <32 x i8> %arg1 to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %bc = bitcast <32 x i8> %8 to <4 x i64> ret <4 x i64> %bc } -declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone + define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_adds_epi16: @@ -111,11 +118,18 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1) - %bc = bitcast <16 x i16> %res to <4 x i64> + %1 = sext <16 x i16> %arg0 to <16 x i32> + %2 = sext <16 x i16> %arg1 to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %bc = bitcast <16 x i16> %8 to <4 x i64> ret <4 x i64> %bc } -declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone + define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_adds_epu8: @@ -124,11 +138,13 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1) - %bc = bitcast <32 x i8> %res to <4 x i64> + %1 = add <32 x i8> %arg0, %arg1 + %2 = icmp ugt <32 x i8> %arg0, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + %bc = bitcast <32 x i8> %3 to <4 x i64> ret <4 x i64> %bc } -declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone + define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_adds_epu16: @@ -137,11 +153,12 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1) - %bc = bitcast <16 x i16> %res to <4 x i64> + %1 = add <16 x i16> %arg0, %arg1 + %2 = icmp ugt <16 x i16> %arg0, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %bc = bitcast <16 x i16> %3 to <4 x i64> ret <4 x i64> %bc } -declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_alignr_epi8: @@ -2529,11 +2546,18 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1) - %bc = bitcast <32 x i8> %res to <4 x i64> + %1 = sext <32 x i8> %arg0 to <32 x i16> + %2 = sext <32 x i8> %arg1 to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %bc = bitcast <32 x i8> %8 to <4 x i64> ret <4 x i64> %bc } -declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone + define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_subs_epi16: @@ -2542,37 +2566,49 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1) - %bc = bitcast <16 x i16> %res to <4 x i64> + %1 = sext <16 x i16> %arg0 to <16 x i32> + %2 = sext <16 x i16> %arg1 to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %bc = bitcast <16 x i16> %8 to <4 x i64> ret <4 x i64> %bc } -declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone + define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_subs_epu8: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1) - %bc = bitcast <32 x i8> %res to <4 x i64> + %cmp = icmp ugt <32 x i8> %arg0, %arg1 + %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1 + %sub = sub <32 x i8> %sel, %arg1 + %bc = bitcast <32 x i8> %sub to <4 x i64> ret <4 x i64> %bc } -declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone + define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_subs_epu16: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1) - %bc = bitcast <16 x i16> %res to <4 x i64> + %cmp = icmp ugt <16 x i16> %arg0, %arg1 + %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1 + %sub = sub <16 x i16> %sel, %arg1 + %bc = bitcast <16 x i16> %sub to <4 x i64> ret <4 x i64> %bc } -declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: test_mm256_unpackhi_epi8: Index: test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -878,3 +878,107 @@ ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_paddus_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_paddus_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_paddus_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_paddus_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubus_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubus_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubus_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubus_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone Index: test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86.ll +++ test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -233,58 +233,6 @@ declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone -define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_paddus_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_paddus_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_paddus_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_paddus_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone - - define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) { ; X86-AVX-LABEL: test_x86_avx2_pmadd_wd: ; X86-AVX: ## %bb.0: @@ -978,58 +926,6 @@ } declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone - -define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psubus_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psubus_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psubus_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psubus_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone - define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) { ; X86-LABEL: test_x86_avx2_phadd_d: ; X86: ## %bb.0: @@ -1330,28 +1226,28 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovaps LCPI54_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: vmovaps LCPI50_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> ) ret <16 x i16> %res @@ -2071,36 +1967,36 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X86-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4 -; X86-AVX-NEXT: vpsravd LCPI86_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI82_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI86_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] +; X86-AVX512VL-NEXT: vmovdqa LCPI82_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI86_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI82_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X64-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> ) ret <4 x i32> %res @@ -2136,36 +2032,36 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 -; X86-AVX-NEXT: vpsravd LCPI88_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI84_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI88_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-AVX512VL-NEXT: vmovdqa LCPI84_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI88_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI84_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI84_1-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI84_1-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> ) ret <8 x i32> %res Index: test/CodeGen/X86/avx512bw-intrinsics-canonical.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512bw-intrinsics-canonical.ll @@ -0,0 +1,675 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c + +define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %passThru + ret <32 x i16> %5 +} + +define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer + ret <32 x i16> %5 +} + +define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %passThru + ret <32 x i16> %5 +} + +define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer + ret <32 x i16> %5 +} + +define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + ret <32 x i16> %sub +} + +define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + ret <32 x i16> %sub +} + +define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer + ret <32 x i16> %res +} + +define <64 x i16> @test_mask_adds_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddsw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpaddsw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddsw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl + %1 = sext <64 x i16> %a to <64 x i32> + %2 = sext <64 x i16> %b to <64 x i32> + %3 = add nsw <64 x i32> %1, %2 + %4 = icmp slt <64 x i32> %3, + %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> + %6 = icmp sgt <64 x i32> %5, + %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> + %8 = trunc <64 x i32> %7 to <64 x i16> + ret <64 x i16> %8 +} + +define <64 x i16> @test_mask_subs_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsubsw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpsubsw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpsubsw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl + %1 = sext <64 x i16> %a to <64 x i32> + %2 = sext <64 x i16> %b to <64 x i32> + %3 = sub nsw <64 x i32> %1, %2 + %4 = icmp slt <64 x i32> %3, + %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> + %6 = icmp sgt <64 x i32> %5, + %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> + %8 = trunc <64 x i32> %7 to <64 x i16> + ret <64 x i16> %8 +} + +define <64 x i16> @test_mask_adds_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddusw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpaddusw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddusw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl + %1 = add <64 x i16> %a, %b + %2 = icmp ugt <64 x i16> %a, %1 + %3 = select <64 x i1> %2, <64 x i16> , <64 x i16> %1 + ret <64 x i16> %3 +} + +define <64 x i16> @test_mask_subs_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsubusw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpsubusw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpsubusw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <64 x i16> %a, %b + %sel = select <64 x i1> %cmp, <64 x i16> %a, <64 x i16> %b + %sub = sub <64 x i16> %sel, %b + ret <64 x i16> %sub +} Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -2888,3 +2888,211 @@ %res4 = add <32 x i16> %res3, %res2 ret <32 x i16> %res4 } + +define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -757,203 +757,6 @@ declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epu16_rr_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdd,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; X86-LABEL: test_mask_adds_epu16_rrk_512: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdd,0xd1] -; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rrk_512: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdd,0xd1] -; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; X86-LABEL: test_mask_adds_epu16_rrkz_512: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdd,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rrkz_512: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdd,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; X86-LABEL: test_mask_adds_epu16_rm_512: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpaddusw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdd,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rm_512: -; X64: # %bb.0: -; X64-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdd,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; X86-LABEL: test_mask_adds_epu16_rmk_512: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdd,0x08] -; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rmk_512: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdd,0x0f] -; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; X86-LABEL: test_mask_adds_epu16_rmkz_512: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdd,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rmkz_512: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdd,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - -define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epu16_rr_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd9,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; X86-LABEL: test_mask_subs_epu16_rrk_512: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd9,0xd1] -; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rrk_512: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd9,0xd1] -; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; X86-LABEL: test_mask_subs_epu16_rrkz_512: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd9,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rrkz_512: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd9,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; X86-LABEL: test_mask_subs_epu16_rm_512: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpsubusw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd9,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rm_512: -; X64: # %bb.0: -; X64-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd9,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; X86-LABEL: test_mask_subs_epu16_rmk_512: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd9,0x08] -; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rmk_512: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd9,0x0f] -; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; X86-LABEL: test_mask_subs_epu16_rmkz_512: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd9,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rmkz_512: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd9,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: Index: test/CodeGen/X86/avx512bwvl-intrinsics-canonical.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512bwvl-intrinsics-canonical.ll @@ -0,0 +1,1572 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c + +define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %passThru + ret <8 x i16> %5 +} + +define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer + ret <8 x i16> %5 +} + +define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %passThru + ret <8 x i16> %5 +} + +define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer + ret <8 x i16> %5 +} + +define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %passThru + ret <16 x i16> %5 +} + +define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer + ret <16 x i16> %5 +} + +define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %passThru + ret <16 x i16> %5 +} + +define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer + ret <16 x i16> %5 +} + +define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + ret <8 x i16> %sub +} + +define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + ret <8 x i16> %sub +} + +define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer + ret <8 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + ret <16 x i16> %sub +} + +define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + ret <16 x i16> %sub +} + +define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer + ret <16 x i16> %res +} + +define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> %10 +} + +define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epi8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epu8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %passThru + ret <16 x i8> %5 +} + +define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> zeroinitializer + ret <16 x i8> %5 +} + +define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %passThru + ret <16 x i8> %5 +} + +define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> zeroinitializer + ret <16 x i8> %5 +} + +define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epu8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %passThru + ret <32 x i8> %5 +} + +define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> zeroinitializer + ret <32 x i8> %5 +} + +define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %passThru + ret <32 x i8> %5 +} + +define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> zeroinitializer + ret <32 x i8> %5 +} + +define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epu8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + ret <16 x i8> %sub +} + +define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + ret <16 x i8> %sub +} + +define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer + ret <16 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epu8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + ret <32 x i8> %sub +} + +define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + ret <32 x i8> %sub +} + +define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer + ret <32 x i8> %res +} Index: test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -6345,3 +6345,803 @@ %res4 = add <16 x i16> %res3, %res2 ret <16 x i16> %res4 } + +define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; X86-LABEL: test_mask_adds_epu16_rrk_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rrk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; X86-LABEL: test_mask_adds_epu16_rrkz_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rrkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; X86-LABEL: test_mask_adds_epu16_rm_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpaddusw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rm_128: +; X64: # %bb.0: +; X64-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; X86-LABEL: test_mask_adds_epu16_rmk_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: vpaddusw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rmk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; X86-LABEL: test_mask_adds_epu16_rmkz_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: vpaddusw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rmkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; X86-LABEL: test_mask_adds_epu16_rrk_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rrk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; X86-LABEL: test_mask_adds_epu16_rrkz_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rrkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; X86-LABEL: test_mask_adds_epu16_rm_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpaddusw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rm_256: +; X64: # %bb.0: +; X64-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; X86-LABEL: test_mask_adds_epu16_rmk_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddusw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rmk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; X86-LABEL: test_mask_adds_epu16_rmkz_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddusw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu16_rmkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; X86-LABEL: test_mask_subs_epu16_rrk_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rrk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; X86-LABEL: test_mask_subs_epu16_rrkz_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rrkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; X86-LABEL: test_mask_subs_epu16_rm_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpsubusw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rm_128: +; X64: # %bb.0: +; X64-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; X86-LABEL: test_mask_subs_epu16_rmk_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: vpsubusw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rmk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; X86-LABEL: test_mask_subs_epu16_rmkz_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: vpsubusw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rmkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; X86-LABEL: test_mask_subs_epu16_rrk_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rrk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; X86-LABEL: test_mask_subs_epu16_rrkz_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rrkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; X86-LABEL: test_mask_subs_epu16_rm_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpsubusw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rm_256: +; X64: # %bb.0: +; X64-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; X86-LABEL: test_mask_subs_epu16_rmk_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubusw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rmk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; X86-LABEL: test_mask_subs_epu16_rmkz_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubusw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu16_rmkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epu8_rr_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; X86-LABEL: test_mask_adds_epu8_rrk_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rrk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; X86-LABEL: test_mask_adds_epu8_rrkz_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rrkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; X86-LABEL: test_mask_adds_epu8_rm_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpaddusb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rm_128: +; X64: # %bb.0: +; X64-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; X86-LABEL: test_mask_adds_epu8_rmk_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddusb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rmk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; X86-LABEL: test_mask_adds_epu8_rmkz_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddusb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rmkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epu8_rr_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; X86-LABEL: test_mask_adds_epu8_rrk_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rrk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; X86-LABEL: test_mask_adds_epu8_rrkz_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rrkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; X86-LABEL: test_mask_adds_epu8_rm_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpaddusb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rm_256: +; X64: # %bb.0: +; X64-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; X86-LABEL: test_mask_adds_epu8_rmk_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddusb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rmk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; X86-LABEL: test_mask_adds_epu8_rmkz_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddusb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epu8_rmkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + +define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epu8_rr_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; X86-LABEL: test_mask_subs_epu8_rrk_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rrk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; X86-LABEL: test_mask_subs_epu8_rrkz_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rrkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; X86-LABEL: test_mask_subs_epu8_rm_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpsubusb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rm_128: +; X64: # %bb.0: +; X64-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; X86-LABEL: test_mask_subs_epu8_rmk_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubusb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rmk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; X86-LABEL: test_mask_subs_epu8_rmkz_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubusb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rmkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res +} + +declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epu8_rr_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; X86-LABEL: test_mask_subs_epu8_rrk_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rrk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; X86-LABEL: test_mask_subs_epu8_rrkz_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rrkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; X86-LABEL: test_mask_subs_epu8_rm_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpsubusb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rm_256: +; X64: # %bb.0: +; X64-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; X86-LABEL: test_mask_subs_epu8_rmk_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubusb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rmk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; X86-LABEL: test_mask_subs_epu8_rmkz_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubusb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epu8_rmkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} + +declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -1531,410 +1531,6 @@ declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) -define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epu16_rr_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; X86-LABEL: test_mask_adds_epu16_rrk_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1] -; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rrk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1] -; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; X86-LABEL: test_mask_adds_epu16_rrkz_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rrkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; X86-LABEL: test_mask_adds_epu16_rm_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpaddusw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rm_128: -; X64: # %bb.0: -; X64-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; X86-LABEL: test_mask_adds_epu16_rmk_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpaddusw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x08] -; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rmk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f] -; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; X86-LABEL: test_mask_adds_epu16_rmkz_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpaddusw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rmkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) - -define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epu16_rr_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; X86-LABEL: test_mask_adds_epu16_rrk_256: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1] -; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rrk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1] -; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; X86-LABEL: test_mask_adds_epu16_rrkz_256: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rrkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; X86-LABEL: test_mask_adds_epu16_rm_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpaddusw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rm_256: -; X64: # %bb.0: -; X64-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; X86-LABEL: test_mask_adds_epu16_rmk_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddusw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x08] -; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rmk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f] -; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; X86-LABEL: test_mask_adds_epu16_rmkz_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddusw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu16_rmkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) - -define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epu16_rr_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; X86-LABEL: test_mask_subs_epu16_rrk_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1] -; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rrk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1] -; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; X86-LABEL: test_mask_subs_epu16_rrkz_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rrkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; X86-LABEL: test_mask_subs_epu16_rm_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpsubusw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rm_128: -; X64: # %bb.0: -; X64-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; X86-LABEL: test_mask_subs_epu16_rmk_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpsubusw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x08] -; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rmk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f] -; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; X86-LABEL: test_mask_subs_epu16_rmkz_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpsubusw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rmkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) - ret <8 x i16> %res -} - -declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) - -define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epu16_rr_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; X86-LABEL: test_mask_subs_epu16_rrk_256: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1] -; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rrk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1] -; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; X86-LABEL: test_mask_subs_epu16_rrkz_256: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rrkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; X86-LABEL: test_mask_subs_epu16_rm_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpsubusw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rm_256: -; X64: # %bb.0: -; X64-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; X86-LABEL: test_mask_subs_epu16_rmk_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubusw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x08] -; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rmk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f] -; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; X86-LABEL: test_mask_subs_epu16_rmkz_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubusw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu16_rmkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) - ret <16 x i16> %res -} - -declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) - define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: test_mask_adds_epi8_rr_128: ; CHECK: # %bb.0: @@ -2331,399 +1927,6 @@ declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) -define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_mask_adds_epu8_rr_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; X86-LABEL: test_mask_adds_epu8_rrk_128: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1] -; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rrk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1] -; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; X86-LABEL: test_mask_adds_epu8_rrkz_128: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rrkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; X86-LABEL: test_mask_adds_epu8_rm_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpaddusb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rm_128: -; X64: # %bb.0: -; X64-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; X86-LABEL: test_mask_adds_epu8_rmk_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddusb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x08] -; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rmk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f] -; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; X86-LABEL: test_mask_adds_epu8_rmkz_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddusb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rmkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_adds_epu8_rr_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; X86-LABEL: test_mask_adds_epu8_rrk_256: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1] -; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rrk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1] -; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; X86-LABEL: test_mask_adds_epu8_rrkz_256: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rrkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; X86-LABEL: test_mask_adds_epu8_rm_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpaddusb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rm_256: -; X64: # %bb.0: -; X64-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; X86-LABEL: test_mask_adds_epu8_rmk_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddusb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x08] -; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rmk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f] -; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; X86-LABEL: test_mask_adds_epu8_rmkz_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddusb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epu8_rmkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - -define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_mask_subs_epu8_rr_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; X86-LABEL: test_mask_subs_epu8_rrk_128: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1] -; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rrk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1] -; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; X86-LABEL: test_mask_subs_epu8_rrkz_128: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rrkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; X86-LABEL: test_mask_subs_epu8_rm_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpsubusb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rm_128: -; X64: # %bb.0: -; X64-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; X86-LABEL: test_mask_subs_epu8_rmk_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubusb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x08] -; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rmk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f] -; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; X86-LABEL: test_mask_subs_epu8_rmkz_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubusb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rmkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_subs_epu8_rr_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; X86-LABEL: test_mask_subs_epu8_rrk_256: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1] -; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rrk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1] -; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; X86-LABEL: test_mask_subs_epu8_rrkz_256: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rrkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; X86-LABEL: test_mask_subs_epu8_rm_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpsubusb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rm_256: -; X64: # %bb.0: -; X64-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; X86-LABEL: test_mask_subs_epu8_rmk_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubusb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x08] -; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rmk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f] -; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; X86-LABEL: test_mask_subs_epu8_rmkz_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubusb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epu8_rmkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) Index: test/CodeGen/X86/sse2-intrinsics-canonical.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/sse2-intrinsics-canonical.ll @@ -0,0 +1,664 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2 +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c + +define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_padds_b: +; SSE: ## %bb.0: +; SSE-NEXT: paddsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xec,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <16 x i8> %a0 to <16 x i16> + %2 = sext <16 x i8> %a1 to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_padds_w: +; SSE: ## %bb.0: +; SSE-NEXT: paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <8 x i16> %a0 to <8 x i32> + %2 = sext <8 x i16> %a1 to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_b: +; SSE: ## %bb.0: +; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = add <16 x i8> %a0, %a1 + %2 = icmp ugt <16 x i8> %a0, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + ret <16 x i8> %3 +} + +define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_w: +; SSE: ## %bb.0: +; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = add <8 x i16> %a0, %a1 + %2 = icmp ugt <8 x i16> %a0, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + ret <8 x i16> %3 +} + +define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_b: +; SSE: ## %bb.0: +; SSE-NEXT: psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <16 x i8> %a0 to <16 x i16> + %2 = sext <16 x i8> %a1 to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_w: +; SSE: ## %bb.0: +; SSE-NEXT: psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <8 x i16> %a0 to <8 x i32> + %2 = sext <8 x i16> %a1 to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_b: +; SSE: ## %bb.0: +; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %cmp = icmp ugt <16 x i8> %a0, %a1 + %sel = select <16 x i1> %cmp, <16 x i8> %a0, <16 x i8> %a1 + %sub = sub <16 x i8> %sel, %a1 + ret <16 x i8> %sub +} + +define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_w: +; SSE: ## %bb.0: +; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %cmp = icmp ugt <8 x i16> %a0, %a1 + %sel = select <8 x i1> %cmp, <8 x i16> %a0, <8 x i16> %a1 + %sub = sub <8 x i16> %sel, %a1 + ret <8 x i16> %sub +} + +define <8 x i8> @test_x86_sse2_padds_b_64(<8 x i8> %a0, <8 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_padds_b_64: +; SSE: ## %bb.0: +; SSE-NEXT: psllw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x08] +; SSE-NEXT: psraw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x08] +; SSE-NEXT: psllw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xf1,0x08] +; SSE-NEXT: psraw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xe1,0x08] +; SSE-NEXT: paddw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xfd,0xc1] +; SSE-NEXT: pminsw LCPI8_0, %xmm0 ## encoding: [0x66,0x0f,0xea,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4 +; SSE-NEXT: pmaxsw LCPI8_1, %xmm0 ## encoding: [0x66,0x0f,0xee,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI8_1, kind: FK_Data_4 +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_b_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x08] +; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x08] +; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xf1,0x08] +; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xe1,0x08] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] +; AVX2-NEXT: vpminsw LCPI8_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4 +; AVX2-NEXT: vpmaxsw LCPI8_1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI8_1, kind: FK_Data_4 +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_b_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpsllw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x08] +; SKX-NEXT: vpsraw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x08] +; SKX-NEXT: vpsllw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xf1,0x08] +; SKX-NEXT: vpsraw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xe1,0x08] +; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] +; SKX-NEXT: vpminsw LCPI8_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4 +; SKX-NEXT: vpmaxsw LCPI8_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI8_1, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <8 x i8> %a0 to <8 x i16> + %2 = sext <8 x i8> %a1 to <8 x i16> + %3 = add nsw <8 x i16> %1, %2 + %4 = icmp slt <8 x i16> %3, + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> + %6 = icmp sgt <8 x i16> %5, + %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> + %8 = trunc <8 x i16> %7 to <8 x i8> + ret <8 x i8> %8 +} + +define <4 x i16> @test_x86_sse2_padds_w_64(<4 x i16> %a0, <4 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_padds_w_64: +; SSE: ## %bb.0: +; SSE-NEXT: pslld $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xf0,0x10] +; SSE-NEXT: psrad $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xe0,0x10] +; SSE-NEXT: pslld $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xf1,0x10] +; SSE-NEXT: psrad $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xe1,0x10] +; SSE-NEXT: paddd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfe,0xc8] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [32767,32767,32767,32767] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI9_0, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd1] +; SSE-NEXT: pand %xmm2, %xmm1 ## encoding: [0x66,0x0f,0xdb,0xca] +; SSE-NEXT: pandn %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd0] +; SSE-NEXT: por %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x0d,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI9_1, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc1] +; SSE-NEXT: pand %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd0] +; SSE-NEXT: pandn %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdf,0xc1] +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_w_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpslld $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xf0,0x10] +; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xe0,0x10] +; AVX2-NEXT: vpslld $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xf1,0x10] +; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xe1,0x10] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfe,0xc1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI9_0, kind: FK_Data_4 +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x39,0xc1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI9_1, kind: FK_Data_4 +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3d,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_w_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpslld $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x10] +; SKX-NEXT: vpsrad $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xe0,0x10] +; SKX-NEXT: vpslld $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xf1,0x10] +; SKX-NEXT: vpsrad $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xe1,0x10] +; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; SKX-NEXT: vpminsd LCPI9_0{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x39,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI9_0, kind: FK_Data_4 +; SKX-NEXT: vpmaxsd LCPI9_1{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x3d,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI9_1, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <4 x i16> %a0 to <4 x i32> + %2 = sext <4 x i16> %a1 to <4 x i32> + %3 = add nsw <4 x i32> %1, %2 + %4 = icmp slt <4 x i32> %3, + %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> + %6 = icmp sgt <4 x i32> %5, + %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> + %8 = trunc <4 x i32> %7 to <4 x i16> + ret <4 x i16> %8 +} + +define <8 x i8> @test_x86_sse2_paddus_b_64(<8 x i8> %a0, <8 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_b_64: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4 +; SSE-NEXT: paddw %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfd,0xc8] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pand %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd1] +; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x65,0xc2] +; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] +; SSE-NEXT: pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1] +; SSE-NEXT: pand LCPI10_0, %xmm0 ## encoding: [0x66,0x0f,0xdb,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4 +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_b_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0xdb,0xda] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0xdb,0xca] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x65,0xc9] +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4c,0xc2,0x10] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_b_64: +; SKX: ## %bb.0: +; SKX-NEXT: vmovdqa LCPI10_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4 +; SKX-NEXT: vpand %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xda] +; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] +; SKX-NEXT: vpand %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xca] +; SKX-NEXT: vpcmpnleuw %xmm1, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x3e,0xc9,0x06] +; SKX-NEXT: vmovdqu16 LCPI10_0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI10_0, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = add <8 x i8> %a0, %a1 + %2 = icmp ugt <8 x i8> %a0, %1 + %3 = select <8 x i1> %2, <8 x i8> , <8 x i8> %1 + ret <8 x i8> %3 +} + +define <4 x i16> @test_x86_sse2_paddus_w_64(<4 x i16> %a0, <4 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_w_64: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4 +; SSE-NEXT: paddd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfe,0xc8] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pand %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd1] +; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc2] +; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] +; SSE-NEXT: pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1] +; SSE-NEXT: pand LCPI11_0, %xmm0 ## encoding: [0x66,0x0f,0xdb,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4 +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_w_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] +; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm3 ## encoding: [0xc4,0xe3,0x79,0x0e,0xda,0xaa] +; AVX2-NEXT: ## xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfe,0xc1] +; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0e,0xca,0xaa] +; AVX2-NEXT: ## xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x66,0xc9] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x18,0x15,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI11_0, kind: FK_Data_4 +; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4a,0xc2,0x10] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_w_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm3 ## encoding: [0xc4,0xe3,0x79,0x0e,0xda,0xaa] +; SKX-NEXT: ## xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0e,0xca,0xaa] +; SKX-NEXT: ## xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SKX-NEXT: vpcmpnleud %xmm1, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1e,0xc9,0x06] +; SKX-NEXT: vpbroadcastd LCPI11_0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI11_0, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = add <4 x i16> %a0, %a1 + %2 = icmp ugt <4 x i16> %a0, %1 + %3 = select <4 x i1> %2, <4 x i16> , <4 x i16> %1 + ret <4 x i16> %3 +} + +define <8 x i8> @test_x86_sse2_psubs_b_64(<8 x i8> %a0, <8 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_b_64: +; SSE: ## %bb.0: +; SSE-NEXT: psllw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x08] +; SSE-NEXT: psraw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x08] +; SSE-NEXT: psllw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xf1,0x08] +; SSE-NEXT: psraw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xe1,0x08] +; SSE-NEXT: psubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf9,0xc1] +; SSE-NEXT: pminsw LCPI12_0, %xmm0 ## encoding: [0x66,0x0f,0xea,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI12_0, kind: FK_Data_4 +; SSE-NEXT: pmaxsw LCPI12_1, %xmm0 ## encoding: [0x66,0x0f,0xee,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI12_1, kind: FK_Data_4 +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_b_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x08] +; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x08] +; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xf1,0x08] +; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xe1,0x08] +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf9,0xc1] +; AVX2-NEXT: vpminsw LCPI12_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI12_0, kind: FK_Data_4 +; AVX2-NEXT: vpmaxsw LCPI12_1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI12_1, kind: FK_Data_4 +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_b_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpsllw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x08] +; SKX-NEXT: vpsraw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x08] +; SKX-NEXT: vpsllw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xf1,0x08] +; SKX-NEXT: vpsraw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xe1,0x08] +; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1] +; SKX-NEXT: vpminsw LCPI12_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI12_0, kind: FK_Data_4 +; SKX-NEXT: vpmaxsw LCPI12_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI12_1, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <8 x i8> %a0 to <8 x i16> + %2 = sext <8 x i8> %a1 to <8 x i16> + %3 = sub nsw <8 x i16> %1, %2 + %4 = icmp slt <8 x i16> %3, + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> + %6 = icmp sgt <8 x i16> %5, + %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> + %8 = trunc <8 x i16> %7 to <8 x i8> + ret <8 x i8> %8 +} + +define <4 x i16> @test_x86_sse2_psubs_w_64(<4 x i16> %a0, <4 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_w_64: +; SSE: ## %bb.0: +; SSE-NEXT: pslld $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xf0,0x10] +; SSE-NEXT: psrad $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xe0,0x10] +; SSE-NEXT: pslld $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xf1,0x10] +; SSE-NEXT: psrad $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xe1,0x10] +; SSE-NEXT: psubd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xfa,0xc1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x0d,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI13_0, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd1] +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd0] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1] +; SSE-NEXT: por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x0d,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI13_1, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc1] +; SSE-NEXT: pand %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd0] +; SSE-NEXT: pandn %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdf,0xc1] +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_w_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpslld $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xf0,0x10] +; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xe0,0x10] +; AVX2-NEXT: vpslld $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xf1,0x10] +; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xe1,0x10] +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfa,0xc1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI13_0, kind: FK_Data_4 +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x39,0xc1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI13_1, kind: FK_Data_4 +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3d,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_w_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpslld $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x10] +; SKX-NEXT: vpsrad $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xe0,0x10] +; SKX-NEXT: vpslld $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xf1,0x10] +; SKX-NEXT: vpsrad $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xe1,0x10] +; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1] +; SKX-NEXT: vpminsd LCPI13_0{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x39,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI13_0, kind: FK_Data_4 +; SKX-NEXT: vpmaxsd LCPI13_1{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x3d,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI13_1, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <4 x i16> %a0 to <4 x i32> + %2 = sext <4 x i16> %a1 to <4 x i32> + %3 = sub nsw <4 x i32> %1, %2 + %4 = icmp slt <4 x i32> %3, + %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> + %6 = icmp sgt <4 x i32> %5, + %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> + %8 = trunc <4 x i32> %7 to <4 x i16> + ret <4 x i16> %8 +} + +define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_b_64: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI14_0, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9] +; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pmaxsw %xmm3, %xmm0 ## encoding: [0x66,0x0f,0xee,0xc3] +; SSE-NEXT: psubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf9,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_b_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI14_0, kind: FK_Data_4 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ## encoding: [0xc5,0xf1,0xdb,0xda] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc2] +; AVX2-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc3] +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf9,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_b_64: +; SKX: ## %bb.0: +; SKX-NEXT: vmovdqa LCPI14_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI14_0, kind: FK_Data_4 +; SKX-NEXT: vpand %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xda] +; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc2] +; SKX-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc3] +; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %cmp = icmp ugt <8 x i8> %a0, %a1 + %sel = select <8 x i1> %cmp, <8 x i8> %a0, <8 x i8> %a1 + %sub = sub <8 x i8> %sel, %a1 + ret <8 x i8> %sub +} + +define <4 x i16> @test_x86_sse2_psubus_w_64(<4 x i16> %a0, <4 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_w_64: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI15_0, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9] +; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd3] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pandn %xmm3, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd3] +; SSE-NEXT: por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0] +; SSE-NEXT: psubd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xfa,0xd1] +; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_w_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] +; AVX2-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa] +; AVX2-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa] +; AVX2-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3f,0xc3] +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfa,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_w_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; SKX-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa] +; SKX-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa] +; SKX-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SKX-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc3] +; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %cmp = icmp ugt <4 x i16> %a0, %a1 + %sel = select <4 x i1> %cmp, <4 x i16> %a0, <4 x i16> %a1 + %sub = sub <4 x i16> %sel, %a1 + ret <4 x i16> %sub +} + +define <8 x i16> @test_x86_sse2_padds_w_fail(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_padds_w_fail: +; SSE: ## %bb.0: +; SSE-NEXT: xorps %xmm0, %xmm0 ## encoding: [0x0f,0x57,0xc0] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_w_fail: +; AVX2: ## %bb.0: +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x57,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_w_fail: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] +; SKX-NEXT: vpmovsdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0] +; SKX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = shufflevector <16 x i8> zeroinitializer, <16 x i8> undef, <8 x i32> + %2 = sext <8 x i8> %1 to <8 x i32> + %3 = add nuw nsw <8 x i32> zeroinitializer, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_x86_sse2_psubs_w_fail(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_w_fail: +; SSE: ## %bb.0: +; SSE-NEXT: xorps %xmm0, %xmm0 ## encoding: [0x0f,0x57,0xc0] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_w_fail: +; AVX2: ## %bb.0: +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x57,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_w_fail: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] +; SKX-NEXT: vpmovsdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0] +; SKX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = shufflevector <16 x i8> zeroinitializer, <16 x i8> undef, <8 x i32> + %2 = sext <8 x i8> %1 to <8 x i32> + %3 = sub nuw nsw <8 x i32> zeroinitializer, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -145,14 +145,42 @@ ; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xec,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX512-LABEL: test_mm_adds_epi8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] -; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX512-LABEL: test_mm_adds_epi8: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc0] +; X86-AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc9] +; X86-AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] +; X86-AVX512-NEXT: vpminsw {{\.LCPI.*}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0x05,A,A,A,A] +; X86-AVX512-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512-NEXT: vpmaxsw {{\.LCPI.*}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0x05,A,A,A,A] +; X86-AVX512-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0] +; X86-AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-AVX512-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512-LABEL: test_mm_adds_epi8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc0] +; X64-AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc9] +; X64-AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] +; X64-AVX512-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0x05,A,A,A,A] +; X64-AVX512-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0x05,A,A,A,A] +; X64-AVX512-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0] +; X64-AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %1 = sext <16 x i8> %arg0 to <16 x i16> + %2 = sext <16 x i8> %arg1 to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %bc = bitcast <16 x i8> %8 to <2 x i64> ret <2 x i64> %bc } declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone @@ -168,17 +196,44 @@ ; AVX1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xed,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX512-LABEL: test_mm_adds_epi16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] -; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX512-LABEL: test_mm_adds_epi16: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc0] +; X86-AVX512-NEXT: vpmovsxwd %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc9] +; X86-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; X86-AVX512-NEXT: vpminsd {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x39,0x05,A,A,A,A] +; X86-AVX512-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512-NEXT: vpmaxsd {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x3d,0x05,A,A,A,A] +; X86-AVX512-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0] +; X86-AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-AVX512-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512-LABEL: test_mm_adds_epi16: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc0] +; X64-AVX512-NEXT: vpmovsxwd %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc9] +; X64-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; X64-AVX512-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x39,0x05,A,A,A,A] +; X64-AVX512-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x3d,0x05,A,A,A,A] +; X64-AVX512-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0] +; X64-AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %1 = sext <8 x i16> %arg0 to <8 x i32> + %2 = sext <8 x i16> %arg1 to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %bc = bitcast <8 x i16> %8 to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_adds_epu8: @@ -197,11 +252,12 @@ ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %1 = add <16 x i8> %arg0, %arg1 + %2 = icmp ugt <16 x i8> %arg0, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + %bc = bitcast <16 x i8> %3 to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_adds_epu16: @@ -220,11 +276,12 @@ ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %1 = add <8 x i16> %arg0, %arg1 + %2 = icmp ugt <8 x i16> %arg0, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %bc = bitcast <8 x i16> %3 to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; SSE-LABEL: test_mm_and_pd: @@ -5886,17 +5943,44 @@ ; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xe8,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX512-LABEL: test_mm_subs_epi8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] -; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX512-LABEL: test_mm_subs_epi8: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc0] +; X86-AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc9] +; X86-AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0xc1] +; X86-AVX512-NEXT: vpminsw {{\.LCPI.*}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0x05,A,A,A,A] +; X86-AVX512-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512-NEXT: vpmaxsw {{\.LCPI.*}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0x05,A,A,A,A] +; X86-AVX512-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0] +; X86-AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-AVX512-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512-LABEL: test_mm_subs_epi8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc0] +; X64-AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc9] +; X64-AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0xc1] +; X64-AVX512-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0x05,A,A,A,A] +; X64-AVX512-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0x05,A,A,A,A] +; X64-AVX512-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0] +; X64-AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %1 = sext <16 x i8> %arg0 to <16 x i16> + %2 = sext <16 x i8> %arg1 to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %bc = bitcast <16 x i8> %8 to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_subs_epi16: @@ -5909,63 +5993,118 @@ ; AVX1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xe9,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX512-LABEL: test_mm_subs_epi16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] -; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX512-LABEL: test_mm_subs_epi16: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc0] +; X86-AVX512-NEXT: vpmovsxwd %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc9] +; X86-AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0xc1] +; X86-AVX512-NEXT: vpminsd {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x39,0x05,A,A,A,A] +; X86-AVX512-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512-NEXT: vpmaxsd {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x3d,0x05,A,A,A,A] +; X86-AVX512-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0] +; X86-AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-AVX512-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512-LABEL: test_mm_subs_epi16: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc0] +; X64-AVX512-NEXT: vpmovsxwd %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc9] +; X64-AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0xc1] +; X64-AVX512-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x39,0x05,A,A,A,A] +; X64-AVX512-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x3d,0x05,A,A,A,A] +; X64-AVX512-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0] +; X64-AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %1 = sext <8 x i16> %arg0 to <8 x i32> + %2 = sext <8 x i16> %arg1 to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %bc = bitcast <8 x i16> %8 to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_subs_epu8: ; SSE: # %bb.0: -; SSE-NEXT: psubusb %xmm1, %xmm0 # encoding: [0x66,0x0f,0xd8,0xc1] +; SSE-NEXT: pmaxub %xmm1, %xmm0 # encoding: [0x66,0x0f,0xde,0xc1] +; SSE-NEXT: psubb %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf8,0xc1] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_subs_epu8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xd8,0xc1] +; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xde,0xc1] +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_subs_epu8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xde,0xc1] +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf8,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %cmp = icmp ugt <16 x i8> %arg0, %arg1 + %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1 + %sub = sub <16 x i8> %sel, %arg1 + %bc = bitcast <16 x i8> %sub to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { -; SSE-LABEL: test_mm_subs_epu16: -; SSE: # %bb.0: -; SSE-NEXT: psubusw %xmm1, %xmm0 # encoding: [0x66,0x0f,0xd9,0xc1] -; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-SSE-LABEL: test_mm_subs_epu16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; X86-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-SSE-NEXT: movdqa %xmm1, %xmm3 # encoding: [0x66,0x0f,0x6f,0xd9] +; X86-SSE-NEXT: pxor %xmm2, %xmm3 # encoding: [0x66,0x0f,0xef,0xda] +; X86-SSE-NEXT: pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2] +; X86-SSE-NEXT: pmaxsw %xmm3, %xmm0 # encoding: [0x66,0x0f,0xee,0xc3] +; X86-SSE-NEXT: pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2] +; X86-SSE-NEXT: psubw %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf9,0xc1] +; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_subs_epu16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xd9,0xc1] +; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x3e,0xc1] +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf9,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_subs_epu16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc1] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; X64-SSE-LABEL: test_mm_subs_epu16: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; X64-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-SSE-NEXT: movdqa %xmm1, %xmm3 # encoding: [0x66,0x0f,0x6f,0xd9] +; X64-SSE-NEXT: pxor %xmm2, %xmm3 # encoding: [0x66,0x0f,0xef,0xda] +; X64-SSE-NEXT: pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2] +; X64-SSE-NEXT: pmaxsw %xmm3, %xmm0 # encoding: [0x66,0x0f,0xee,0xc3] +; X64-SSE-NEXT: pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2] +; X64-SSE-NEXT: psubw %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf9,0xc1] +; X64-SSE-NEXT: retq # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %cmp = icmp ugt <8 x i16> %arg0, %arg1 + %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1 + %sub = sub <8 x i16> %sel, %arg1 + %bc = bitcast <8 x i16> %sub to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; SSE-LABEL: test_mm_ucomieq_sd: Index: test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -882,3 +882,87 @@ ret <4 x float> %res } declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone + + +define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_b: +; SSE: ## %bb.0: +; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] +; SKX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_w: +; SSE: ## %bb.0: +; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; SKX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone + + +define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_b: +; SSE: ## %bb.0: +; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] +; SKX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_w: +; SSE: ## %bb.0: +; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; SKX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone Index: test/CodeGen/X86/sse2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-x86.ll +++ test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -961,48 +961,6 @@ declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_paddus_b: -; SSE: ## %bb.0: -; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_paddus_b: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_paddus_b: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: test_x86_sse2_paddus_w: -; SSE: ## %bb.0: -; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_paddus_w: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_paddus_w: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone - - define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_sse2_pmadd_wd: ; SSE: ## %bb.0: @@ -1565,48 +1523,6 @@ declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_psubus_b: -; SSE: ## %bb.0: -; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_psubus_b: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_psubus_b: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: test_x86_sse2_psubus_w: -; SSE: ## %bb.0: -; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_psubus_w: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_psubus_w: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone - - define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) { ; SSE-LABEL: test_x86_sse2_ucomieq_sd: ; SSE: ## %bb.0: